import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')


import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins.head()


penguins['species'].value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64


penguins['island'].value_counts()

Biscoe       163
Dream        123
Torgersen     47
Name: island, dtype: int64


penguins.head()


# Naive approach: compute the mean body mass of each species by hand,
# one species at a time.
# Start from an empty float Series; it is filled in below, keyed by species.
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    # Keep only the rows that belong to the current species...
    species_only = penguins.loc[penguins['species'] == species]
    # ...and store that species' mean body mass under its name.
    species_map.loc[species] = species_only['body_mass_g'].mean()
    
species_map

Adelie       3706.164384
Chinstrap    3733.088235
Gentoo       5092.436975
dtype: float64


penguins.head()


# Mean of every numeric column, per species.
# numeric_only=True is spelled out because pandas >= 2.0 no longer silently
# drops the string columns ('island', 'sex') and raises a TypeError instead.
penguins.groupby('species').mean(numeric_only=True)


%reload_ext pandas_tutor


%%pt

penguins.groupby('species').mean()


penguins.head()


# Species whose median bill length is the largest: take the per-species median
# of the numeric columns, pick 'bill_length_mm', and return the index label
# (species name) of its maximum.
# numeric_only=True keeps the string columns out of the median, which would
# otherwise raise on pandas >= 2.0.
penguins.groupby('species').median(numeric_only=True)['bill_length_mm'].idxmax()

'Chinstrap'


# Proportion of each species that lives on Dream island: the mean of a
# Boolean column is the fraction of True values.
# numeric_only=True keeps Boolean columns such as 'on_Dream' but excludes the
# string columns, which pandas >= 2.0 would otherwise fail to average.
(
    penguins.assign(on_Dream=penguins['island'] == 'Dream')
            .groupby('species')
            .mean(numeric_only=True)['on_Dream']
)

species
Adelie       0.376712
Chinstrap    1.000000
Gentoo       0.000000
Name: on_Dream, dtype: float64


# Mean of every numeric column, per species.
# numeric_only=True is spelled out because pandas >= 2.0 no longer silently
# drops the string columns ('island', 'sex') and raises a TypeError instead.
penguins.groupby('species').mean(numeric_only=True)


penguins.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff2a92b2e80>


# Creates one group for each unique value in the species column.
penguin_groups = penguins.groupby('species')
penguin_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff2a932b6a0>


penguin_groups.groups

{'Adelie': [0, 1, 2, 4, 5, 6, 7, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, ...], 'Chinstrap': [152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219], 'Gentoo': [220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, ...]}


penguin_groups.get_group('Chinstrap')


# Same as the above!
penguins[penguins['species'] == 'Chinstrap']


penguins.head()


# Mean of every numeric column, per species.
# numeric_only=True is spelled out because pandas >= 2.0 no longer silently
# drops the string columns ('island', 'sex') and raises a TypeError instead.
penguins.groupby('species').mean(numeric_only=True)


# Sum of every numeric column, per species.
# numeric_only=True restricts the result to the measurement columns; on
# pandas >= 2.0 the string columns ('island', 'sex') are no longer dropped
# automatically, so the default would also try to "sum" them.
penguins.groupby('species').sum(numeric_only=True)


penguins.groupby('species').last()


penguins.groupby('species').max()


penguins.groupby('species').max()


# This penguin is on Biscoe island, not Torgersen!
penguins.loc[(penguins['species'] == 'Adelie') & (penguins['body_mass_g'] == 4775.0)]


penguins.sort_values('body_mass_g', ascending=False).groupby('species').first()


# Mean of every numeric column, per species.
# numeric_only=True is spelled out because pandas >= 2.0 no longer silently
# drops the string columns ('island', 'sex') and raises a TypeError instead.
penguins.groupby('species').mean(numeric_only=True)


# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
# numeric_only=True is needed on pandas >= 2.0, where averaging the string
# columns ('island', 'sex') raises instead of dropping them silently.
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

species
Adelie       38.823973
Chinstrap    48.833824
Gentoo       47.568067
Name: bill_length_mm, dtype: float64


# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff2cbc18f40>


# Saves time!
penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.823973
Chinstrap    48.833824
Gentoo       47.568067
Name: bill_length_mm, dtype: float64


%%timeit
# Aggregate-then-select variant being timed. numeric_only=True keeps it
# runnable on pandas >= 2.0, which raises when averaging string columns.
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

452 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


%%timeit
penguins.groupby('species')['bill_length_mm'].mean()

194 µs ± 901 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


%%timeit
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()
    
species_map

1.3 ms ± 3.19 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


penguins.groupby('species')['body_mass_g'].aggregate(['count', 'mean'])


# Apply several aggregations ('count' and 'mean') to every numeric column.
# The numeric columns are selected explicitly because 'mean' cannot be applied
# to the string columns ('island', 'sex'); pandas >= 2.0 raises instead of
# dropping them.
numeric_columns = penguins.select_dtypes('number').columns.tolist()
penguins.groupby('species')[numeric_columns].aggregate(['count', 'mean'])


penguins.groupby('species').aggregate({'bill_length_mm': 'max', 'island': 'unique'})


(
    penguins.groupby('species')['body_mass_g']
            .aggregate(lambda col: np.percentile(col, 75) - np.percentile(col, 25))
)

species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64


def z_score(x):
    """Convert ``x`` to standard units.

    Subtracts the mean of ``x`` and divides by its standard deviation computed
    with ``ddof=0`` (the population standard deviation), so the result has
    mean 0 and population standard deviation 1.

    ``x`` may be any object that provides ``.mean()``, ``.std(ddof=...)`` and
    element-wise arithmetic, e.g. a pandas Series or a NumPy array. If every
    value in ``x`` is identical the standard deviation is 0 and the division
    is left to the underlying library (no special handling here).
    """
    return (x - x.mean()) / x.std(ddof=0)


penguins.head()


# Standardize each numeric column within its own species: transform applies
# z_score to every column of every group and returns a result with the same
# index as penguins.
# The numeric columns are selected explicitly because z_score does arithmetic
# that is undefined for the string columns ('island', 'sex'); pandas >= 2.0
# raises on them instead of dropping them.
numeric_columns = penguins.select_dtypes('number').columns.tolist()
z_scored = penguins.groupby('species')[numeric_columns].transform(z_score)
z_scored.head()


np.random.seed(1)
penguins.assign(z_mass=z_scored['body_mass_g']).sample(5)


penguins.head()


penguins.groupby('species').filter(lambda df: df['bill_length_mm'].mean() > 39)


penguins.groupby('species').filter(lambda df: df.shape[0] > 100)


# Group on two columns at once: one row per (species, island) combination that
# appears in the data, indexed by a MultiIndex.
# numeric_only=True leaves the remaining string column ('sex') out of the
# mean, which pandas >= 2.0 would otherwise raise on.
species_and_island = penguins.groupby(['species', 'island']).mean(numeric_only=True)
species_and_island


species_and_island


species_and_island['body_mass_g']

species    island   
Adelie     Biscoe       3709.659091
           Dream        3701.363636
           Torgersen    3708.510638
Chinstrap  Dream        3733.088235
Gentoo     Biscoe       5092.436975
Name: body_mass_g, dtype: float64


species_and_island.loc['Adelie']


species_and_island.loc[('Adelie', 'Torgersen')]

bill_length_mm         39.038298
bill_depth_mm          18.451064
flipper_length_mm     191.531915
body_mass_g          3708.510638
Name: (Adelie, Torgersen), dtype: float64


species_and_island.reset_index()


# as_index=False keeps 'species' and 'island' as regular columns instead of
# moving them into the index.
# numeric_only=True leaves the remaining string column ('sex') out of the
# mean, which pandas >= 2.0 would otherwise raise on.
penguins.groupby(['species', 'island'], as_index=False).mean(numeric_only=True)

Name	Assignment	Score
Billy	Homework 1	94
Sally	Homework 1	98
Molly	Homework 1	82
Sally	Homework 2	47

Student Name	Quarter	Course	Instructor	Recommend?	Expected Grade	Hours Per Week	Comments
Billy	WI23	DSC 80	Suraj Rampure	No	A-	14	I hate this class
Billy	WI23	DSC 40B	Justin Eldridge	Yes	B+	9	go big O
Sally	WI23	DSC 10	Janine Tiefenbruck	Yes	A	11	baby pandas are so cute
Molly	WI23	DSC 80	Suraj Rampure	Yes	A+	2	I wish there was music in class
Molly	WI23	DSC 95	Marina Langlois	No	A	3	I loved DSC 30, but 95 wasn't hard enough :(

Quarter	Course	Instructor	Recommend (%)	Expected Grade	Hours Per Week
WI23	DSC 80	Suraj Rampure	6%	3.15 (B)	13.32
WI23	DSC 40B	Justin Eldridge	89%	3.35 (B+)	8.54
WI23	DSC 10	Janine Tiefenbruck	94%	3.45 (B+)	11.49
WI23	DSC 95	Marina Langlois	91%	4.0 (A)	9.21

Quarter	Department	Recommend (%)	Expected Grade	Hours Per Week
WI23	DSC	91%	3.01 (B)	12.29
WI23	BILD	85%	2.78 (C+)	13.21

University	Recommend (%)	Average GPA	Hours Per Week
UC San Diego	94%	3.12 (B)	42.19
UC Irvine	89%	3.15 (B)	38.44
SDSU	88%	2.99 (B-)	36.89

Lecture 4 – Grouping¶

DSC 80, Winter 2023¶

Announcements¶

Agenda¶

Data granularity¶

Granularity¶

Levels of granularity¶

Collecting data¶

Manipulating granularity¶

Example: Penguins¶

Discussion Question¶

Naive approach: looping through unique values¶

Grouping¶

🤔¶

Aside: Pandas Tutor¶

Split-apply-combine¶

More examples¶

`DataFrameGroupBy` objects and aggregation¶

`DataFrameGroupBy` objects¶

Peeking under the hood¶

Aggregation¶

Column independence¶

Discussion Question¶

Column selection and performance implications¶

Takeaways¶

Beyond default aggregation methods¶

The `aggregate` method¶

Example¶

Example¶

Example¶

Other `DataFrameGroupBy` methods¶

Split-apply-combine, revisited¶

Transformations¶

Filtering¶

Grouping with multiple columns¶

Grouping and indexes¶

Summary, next time¶

Summary, next time¶

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species
Adelie	38.823973	18.347260	190.102740	3706.164384
Chinstrap	48.833824	18.420588	195.823529	3733.088235
Gentoo	47.568067	14.996639	217.235294	5092.436975

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
152	Chinstrap	Dream	46.5	17.9	192.0	3500.0	Female
153	Chinstrap	Dream	50.0	19.5	196.0	3900.0	Male
154	Chinstrap	Dream	51.3	19.2	193.0	3650.0	Male
155	Chinstrap	Dream	45.4	18.7	188.0	3525.0	Female
156	Chinstrap	Dream	52.7	19.8	197.0	3725.0	Male
...	...	...	...	...	...	...	...
215	Chinstrap	Dream	55.8	19.8	207.0	4000.0	Male
216	Chinstrap	Dream	43.5	18.1	202.0	3400.0	Female
217	Chinstrap	Dream	49.6	18.2	193.0	3775.0	Male
218	Chinstrap	Dream	50.8	19.0	210.0	4100.0	Male
219	Chinstrap	Dream	50.2	18.7	198.0	3775.0	Female

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species
Adelie	5668.3	2678.7	27755.0	541100.0
Chinstrap	3320.7	1252.6	13316.0	253850.0
Gentoo	5660.6	1784.6	25851.0	606000.0

	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
species
Adelie	Biscoe	43.2	19.0	197.0	4775.0	Male
Chinstrap	Dream	52.0	20.7	210.0	4800.0	Male
Gentoo	Biscoe	49.2	15.2	221.0	6300.0	Male

	bill_length_mm		bill_depth_mm		flipper_length_mm		body_mass_g
	count	mean	count	mean	count	mean	count	mean
species
Adelie	146	38.823973	146	18.347260	146	190.102740	146	3706.164384
Chinstrap	68	48.833824	68	18.420588	68	195.823529	68	3733.088235
Gentoo	119	47.568067	119	14.996639	119	217.235294	119	5092.436975

	bill_length_mm	island
species
Adelie	46.0	[Torgersen, Biscoe, Dream]
Chinstrap	58.0	[Dream]
Gentoo	59.6	[Biscoe]

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
0	0.104025	0.290284	-1.400540	0.095911
1	0.254772	-0.779539	-0.631244	0.205309
2	0.556265	-0.285774	0.753488	-0.998069
4	-0.800453	0.784048	0.445770	-0.560477
5	0.179399	1.853870	-0.015807	-0.122885

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	z_mass
65	Adelie	Biscoe	41.6	18.0	192.0	3950.0	Male	0.533502
276	Gentoo	Biscoe	43.8	13.9	208.0	4300.0	Female	-1.586890
186	Chinstrap	Dream	49.7	18.6	195.0	3600.0	Male	-0.348856
198	Chinstrap	Dream	50.1	17.9	190.0	3400.0	Female	-0.873105
293	Gentoo	Biscoe	46.5	14.8	217.0	5200.0	Female	0.215400

		bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species	island
Adelie	Biscoe	38.975000	18.370455	188.795455	3709.659091
	Dream	38.520000	18.240000	189.927273	3701.363636
	Torgersen	39.038298	18.451064	191.531915	3708.510638
Chinstrap	Dream	48.833824	18.420588	195.823529	3733.088235
Gentoo	Biscoe	47.568067	14.996639	217.235294	5092.436975

Lecture 4 – Grouping¶

DSC 80, Winter 2023¶

Announcements¶

Agenda¶

Data granularity¶

Granularity¶

Levels of granularity¶

Collecting data¶

Manipulating granularity¶

Example: Penguins¶

Discussion Question¶

Naive approach: looping through unique values¶

Grouping¶

🤔¶

Aside: Pandas Tutor¶

Split-apply-combine¶

More examples¶

DataFrameGroupBy objects and aggregation¶

DataFrameGroupBy objects¶

Peeking under the hood¶

Aggregation¶

Column independence¶

Discussion Question¶

Column selection and performance implications¶

Takeaways¶

Beyond default aggregation methods¶

The aggregate method¶

Example¶

Example¶

Example¶

Other DataFrameGroupBy methods¶

Split-apply-combine, revisited¶

Transformations¶

Filtering¶

Grouping with multiple columns¶

Grouping and indexes¶

Summary, next time¶

Summary, next time¶

`DataFrameGroupBy` objects and aggregation¶

`DataFrameGroupBy` objects¶

The `aggregate` method¶

Other `DataFrameGroupBy` methods¶