from dsc80_utils import *

# Pandas Tutor setup
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

dogs = pd.read_csv(Path('data') / 'dogs43.csv', index_col='breed')

dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])

dogs

# Finds the rows corresponding to the five breeds that are cheapest to own on a per-year basis.
(dogs
 .assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
 .sort_values('cost_per_year')
 .iloc[:5]
)

dogs.assign(**{'cost per year 💵': dogs['lifetime_cost'] / dogs['longevity']})

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)

dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy

dogs_copy

def cost_in_thousands():
    """Divide the 'lifetime_cost' column of the global dogs_copy by 1000, in place.

    Nothing is returned: the global DataFrame itself is modified, so every
    call divides the column by 1000 again.
    """
    scaled = dogs_copy['lifetime_cost'] / 1000
    dogs_copy['lifetime_cost'] = scaled

# What happens when we run this twice?
cost_in_thousands()

dogs_copy

dogs['lifetime_cost']

breed
Brittany                  22589.0
Cairn Terrier             21992.0
English Cocker Spaniel    18993.0
                           ...   
Bullmastiff               13936.0
Mastiff                   13581.0
Saint Bernard             20022.0
Name: lifetime_cost, Length: 43, dtype: float64

dogs['lifetime_cost'].to_numpy()

array([22589., 21992., 18993., ..., 13936., 13581., 20022.])

dogs

# Max element in each column.
dogs.max()

kind             working
lifetime_cost    26686.0
longevity           16.5
size               small
weight             175.0
height              30.0
dtype: object

# Max element in each row – a little nonsensical, since each row mixes different kinds of quantities.
# numeric_only=True skips the string columns explicitly; relying on pandas to drop them
# silently is deprecated and raises a TypeError in newer versions.
dogs.max(axis=1, numeric_only=True)

/var/folders/63/35_wxty956bfzx41wxtfm3pc0000gn/T/ipykernel_47912/342781375.py:2: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

breed
Brittany                  22589.0
Cairn Terrier             21992.0
English Cocker Spaniel    18993.0
                           ...   
Bullmastiff               13936.0
Mastiff                   13581.0
Saint Bernard             20022.0
Length: 43, dtype: float64

# The number of unique values in each column.
dogs.nunique()

kind              7
lifetime_cost    43
longevity        40
size              3
weight           37
height           30
dtype: int64

# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()

IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&amp;start=11',
       width=560, height=315)

import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins

penguins['body_mass_g'].mean()

4207.057057057057

# Naive approach: compute the mean body mass of each species by looping
# over the unique values in the species column.

# Start from an empty float Series; each iteration adds one entry labeled by species.
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    # Keep only the rows belonging to the current species.
    species_only = penguins.loc[penguins['species'] == species]
    # Assigning to a new label with .loc adds that entry to the Series.
    species_map.loc[species] = species_only['body_mass_g'].mean()
    
species_map

Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
dtype: float64

# Before:
penguins['body_mass_g'].mean()

4207.057057057057

# After:
penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

%%pt

penguins.groupby('species')['body_mass_g'].mean()

# Fill this in, then respond on q.dsc80.com

# numeric_only=True: only the numeric columns have a mean. Newer pandas versions
# raise a TypeError on the string columns (island, sex) instead of dropping them.
penguins.groupby('species').mean(numeric_only=True)

penguins.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd030c3de50>

# Simplified DataFrame for demonstration:
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small

# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd0103fab20>

%%pt
penguin_groups

penguin_groups.groups

{'Adelie': [0, 1], 'Chinstrap': [156, 157], 'Gentoo': [308, 258, 309]}

penguin_groups.get_group('Chinstrap')

# Same as the above!
penguins_small.query('species == "Chinstrap"')

penguins_small

# numeric_only=True: the sex column holds strings and has no mean. Newer pandas
# versions raise a TypeError on it instead of silently dropping it.
penguins_small.groupby('species').mean(numeric_only=True)

penguins_small.groupby('species').sum()

penguins_small.groupby('species').last()

penguins_small.groupby('species').max()

penguins_small.groupby('species').max()

# This penguin is Female!
# The mask is built from penguins_small itself, so the boolean indexer and the
# DataFrame being indexed share exactly the same index (no reliance on alignment
# with the larger penguins DataFrame).
penguins_small.loc[(penguins_small['species'] == 'Adelie') & (penguins_small['body_mass_g'] == 3800.0)]

# Your code goes here.

# Back to the big penguins dataset!
# numeric_only=True restricts the aggregation to numeric columns; newer pandas
# versions raise a TypeError when asked for the mean of the string columns.
penguins.groupby('species').mean(numeric_only=True)

# Works, but involves wasted effort since the other numeric columns had to be aggregated for no reason.
# (numeric_only=True is needed because newer pandas versions raise a TypeError
# when asked for the mean of the string columns.)
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fd0202c9280>

# Saves time!
penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

%%timeit
# Aggregates every numeric column, then keeps one; numeric_only=True avoids the
# TypeError newer pandas versions raise on the string columns.
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

364 µs ± 6.34 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

%%timeit
penguins.groupby('species')['bill_length_mm'].mean()

139 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

%%timeit
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()
    
species_map

997 µs ± 18 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

# Slower: aggregates every numeric column, then throws away all but one.
# (numeric_only=True avoids the TypeError newer pandas versions raise on string columns.)
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

# Faster
penguins.groupby('species')['bill_length_mm'].mean()

(penguins
 .groupby('species')
 ['body_mass_g']
 .aggregate(['count', 'mean'])
)

(penguins
 .groupby('species')
 .aggregate(['count', 'mean'])
)

(penguins
 .groupby('species')
 .aggregate({'bill_length_mm': 'max', 'island': 'unique'})
)

# Here, the argument to agg is a function,
# which takes in a pd.Series and returns a scalar.

def iqr(s):
    """Return the interquartile range of s: its 75th percentile minus its 25th percentile."""
    upper_quartile = np.percentile(s, 75)
    lower_quartile = np.percentile(s, 25)
    return upper_quartile - lower_quartile

(penguins
 .groupby('species')
 ['body_mass_g']
 .agg(iqr)
)

species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64

def z_score(x):
    """Standardize x: subtract its mean, then divide by its standard deviation computed with ddof=0."""
    centered = x - x.mean()
    spread = x.std(ddof=0)
    return centered / spread

z_score(penguins['body_mass_g'])

0     -0.57
1     -0.51
2     -1.19
       ... 
341    1.92
342    1.23
343    1.48
Name: body_mass_g, Length: 333, dtype: float64

z_mass = (penguins
          .groupby('species')
          ['body_mass_g']
          .transform(z_score))
z_mass

0      0.10
1      0.21
2     -1.00
       ... 
341    1.32
342    0.22
343    0.62
Name: body_mass_g, Length: 333, dtype: float64

penguins.assign(z_mass=z_mass)

display_df(penguins.assign(z_mass=z_mass), rows=8)

penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

(penguins
 .groupby('species')
 .filter(lambda df: df['bill_length_mm'].mean() > 39)
)

(penguins
 .groupby('species')
 .filter(lambda df: df.shape[0] > 100)
)

# Mean of every numeric column for each (species, island) pair; the result is
# indexed by both grouping columns. numeric_only=True is required on newer pandas
# versions, which raise a TypeError on the remaining string column (sex).
species_and_island = penguins.groupby(['species', 'island']).mean(numeric_only=True)
species_and_island

species_and_island

species_and_island['body_mass_g']

species    island   
Adelie     Biscoe       3709.66
           Dream        3701.36
           Torgersen    3708.51
Chinstrap  Dream        3733.09
Gentoo     Biscoe       5092.44
Name: body_mass_g, dtype: float64

species_and_island.loc['Adelie']

species_and_island.loc[('Adelie', 'Torgersen')]

bill_length_mm         39.04
bill_depth_mm          18.45
flipper_length_mm     191.53
body_mass_g          3708.51
Name: (Adelie, Torgersen), dtype: float64

species_and_island.reset_index()

# as_index=False keeps species and island as regular columns instead of moving them to the index.
# numeric_only=True avoids the TypeError newer pandas versions raise on the string column (sex).
penguins.groupby(['species', 'island'], as_index=False).mean(numeric_only=True)

baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# Your code goes here.

df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

last_5_years = baby.query('Year >= 2018')
last_5_years

last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum',
)

# Look at the similarity to the snippet above!
(last_5_years
 .groupby(['Year', 'Sex'])
 [['Count']]
 .sum()
)

penguins

# Counts the rows for each combination of species (rows) and island (columns).
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', # Choice of column here doesn't actually matter: 'count' counts non-missing values, and rows with missing values were dropped when penguins was loaded.
    aggfunc='count',
)

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)

counts = penguins.pivot_table(
    index='species', 
    columns='sex', 
    values='body_mass_g', 
    aggfunc='count', 
    fill_value=0
)
counts

joint = counts / counts.sum().sum()
joint

joint

# Recall, joint.sum(axis=0) adds up the values down the rows,
# which gives one total per **column**.
joint.sum(axis=0)

sex
Female    0.5
Male      0.5
dtype: float64

joint.sum(axis=1)

species
Adelie       0.44
Chinstrap    0.20
Gentoo       0.36
dtype: float64

counts

counts

counts.sum(axis=0)

sex
Female    165
Male      168
dtype: int64

counts / counts.sum(axis=0)

# Your code goes here.

	kind	lifetime_cost	longevity	size	weight	height	cost_per_year
breed
Maltese	toy	19084.0	12.25	small	5.0	9.00	1557.88
Lhasa Apso	non-sporting	22031.0	13.92	small	15.0	10.50	1582.69
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.00	1589.02
Chihuahua	toy	26250.0	16.50	small	5.5	5.00	1590.91
Shih Tzu	toy	21152.0	13.20	small	12.5	9.75	1602.42

	kind	lifetime_cost	longevity	size	weight	height	cost_per_year
breed
Brittany	sporting	22.59	12.92	medium	35.0	19.0	1748.37
Cairn Terrier	terrier	21.99	13.84	small	14.0	10.0	1589.02
English Cocker Spaniel	sporting	18.99	11.66	medium	30.0	16.0	1628.90
...	...	...	...	...	...	...	...
Bullmastiff	working	13.94	7.57	large	115.0	25.5	1840.95
Mastiff	working	13.58	6.50	large	175.0	30.0	2089.38
Saint Bernard	working	20.02	7.78	large	155.0	26.5	2573.52

	lifetime_cost	longevity	weight	height
count	43.00	43.00	43.00	43.00
mean	20532.84	11.34	49.35	18.34
std	3290.78	2.05	39.42	6.83
...	...	...	...	...
50%	21006.00	11.81	36.50	18.50
75%	22072.50	12.52	67.50	25.00
max	26686.00	16.50	175.00	30.00

	Name	Sex	Count	Year
0	Liam	M	20456	2022
1	Noah	M	18621	2022
2	Olivia	F	16573	2022
...	...	...	...	...
2085155	Wright	M	5	1880
2085156	York	M	5	1880
2085157	Zachariah	M	5	1880

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species
Adelie	38.82	18.35	190.10	3706.16
Chinstrap	48.83	18.42	195.82	3733.09
Gentoo	47.57	15.00	217.24	5092.44

	bill_length_mm	island
species
Adelie	46.0	[Torgersen, Biscoe, Dream]
Chinstrap	58.0	[Dream]
Gentoo	59.6	[Biscoe]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
152	Chinstrap	Dream	46.5	17.9	192.0	3500.0	Female
153	Chinstrap	Dream	50.0	19.5	196.0	3900.0	Male
154	Chinstrap	Dream	51.3	19.2	193.0	3650.0	Male
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

		bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species	island
Adelie	Biscoe	38.98	18.37	188.80	3709.66
	Dream	38.52	18.24	189.93	3701.36
	Torgersen	39.04	18.45	191.53	3708.51
Chinstrap	Dream	48.83	18.42	195.82	3733.09
Gentoo	Biscoe	47.57	15.00	217.24	5092.44

island	Biscoe	Dream	Torgersen
species
Adelie	44.0	55.0	47.0
Chinstrap	NaN	68.0	NaN
Gentoo	119.0	NaN	NaN

Lecture 3 – Aggregating¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda¶

Question 🤔 (Answer at q.dsc80.com)

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

💡 Pro-Tip: Method chaining¶

💡 Pro-Tip: assign for column names with special characters¶

Adding and modifying columns, in-place¶

Mutability¶

⚠️ Avoid mutation when possible¶

pandas and numpy¶

pandas is built upon numpy!¶

Axes¶

DataFrame methods with axis¶

Data granularity and the groupby method¶

Example: Palmer Penguins¶

Granularity¶

Aggregating¶

Naive approach: looping through unique values¶

Grouping¶

"Split-apply-combine" paradigm¶

More examples¶

Question 🤔 (Answer at q.dsc80.com)

DataFrameGroupBy objects and aggregation¶

DataFrameGroupBy objects¶

Peeking under the hood¶

Aggregation¶

Column independence¶

Question 🤔 (Answer at q.dsc80.com)

Column selection and performance implications¶

Takeaways¶

Beyond default aggregation methods¶

The aggregate method¶

Example¶

Example¶

Example¶

Question 🤔 (Answer at q.dsc80.com)

Other DataFrameGroupBy methods¶

Split-apply-combine, revisited¶

Transformations¶

Transformations within groups¶

Filtering groups¶

Question 🤔 (Answer at q.dsc80.com)

Grouping with multiple columns¶

Grouping and indexes¶

Question 🤔 (Answer at q.dsc80.com)

Pivot tables using the pivot_table method¶

Pivot tables: an extension of grouping¶

pivot_table¶

Example¶

Granularity, revisited¶

Reshaping¶

Question 🤔 (Answer at q.dsc80.com)

Distributions¶

Joint distribution¶

Marginal probabilities¶

Conditional probabilities¶

Conditional probabilities¶

Question 🤔 (Answer at q.dsc80.com)

Summary, next time¶

Summary¶

Next time¶

💡 Pro-Tip: `assign` for column names with special characters¶

`pandas` and `numpy`¶

`pandas` is built upon `numpy`!¶

DataFrame methods with `axis`¶

Data granularity and the `groupby` method¶

`DataFrameGroupBy` objects and aggregation¶

`DataFrameGroupBy` objects¶

The `aggregate` method¶

Other `DataFrameGroupBy` methods¶

Pivot tables using the `pivot_table` method¶

`pivot_table`¶