from dsc80_utils import *

def show_paradox_slides():
    src = 'https://docs.google.com/presentation/d/e/2PACX-1vSbFSaxaYZ0NcgrgqZLvjhkjX-5MQzAITWAsEFZHnix3j1c0qN8Vd1rogTAQP7F7Nf5r-JWExnGey7h/embed?start=false&rm=minimal'
    width = 960
    height = 569
    display(IFrame(src, width, height))

# Pandas Tutor setup
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}
Lecture 3 – Aggregating, Simpson's Paradox¶
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
333 rows × 7 columns
Other DataFrameGroupBy methods¶
Split-apply-combine, revisited¶
When we introduced the split-apply-combine pattern, the "apply" step involved aggregation – our final DataFrame had one row for each group.

Instead of aggregating during the apply step, we could perform a:
- Transformation, in which we perform operations on every value within each group.
- Filtration, in which we keep only the groups that satisfy some condition.
Transformations¶
Suppose we want to convert the 'body_mass_g' column to z-scores (i.e. standard units):
def z_score(x):
    return (x - x.mean()) / x.std(ddof=0)
z_score(penguins['body_mass_g'])
0 -0.57 1 -0.51 2 -1.19 ... 341 1.92 342 1.23 343 1.48 Name: body_mass_g, Length: 333, dtype: float64
Transformations within groups¶
Now, what if we wanted the z-score within each group?
To do so, we can use the transform method on a DataFrameGroupBy object. The transform method takes in a function, which itself takes in a Series and returns a new Series.
A transformation produces a DataFrame or Series of the same size – it is not an aggregation!
z_mass = (penguins
    .groupby('species')
    ['body_mass_g']
    .transform(z_score))
z_mass
0 0.10 1 0.21 2 -1.00 ... 341 1.32 342 0.22 343 0.62 Name: body_mass_g, Length: 333, dtype: float64
penguins.assign(z_mass=z_mass)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | z_mass | |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male | 0.10 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female | 0.21 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female | -1.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male | 1.32 |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female | 0.22 |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male | 0.62 |
333 rows × 8 columns
display_df(penguins.assign(z_mass=z_mass), rows=8)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | z_mass | |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male | 0.10 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female | 0.21 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female | -1.00 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female | -0.56 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female | -0.49 |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male | 1.32 |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female | 0.22 |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male | 0.62 |
333 rows × 8 columns
Note that above, penguin 340 has a larger 'body_mass_g' than penguin 0, but a lower 'z_mass'.
- Penguin 0 has an above-average 'body_mass_g' among 'Adelie' penguins.
- Penguin 340 has a below-average 'body_mass_g' among 'Gentoo' penguins. Remember from earlier that the average 'body_mass_g' of 'Gentoo' penguins is much higher than for other species.
penguins.groupby('species')['body_mass_g'].mean()
species Adelie 3706.16 Chinstrap 3733.09 Gentoo 5092.44 Name: body_mass_g, dtype: float64
Filtering groups¶
To keep only the groups that satisfy a particular condition, use the filter method on a DataFrameGroupBy object.
The filter method takes in a function, which itself takes in a DataFrame/Series and returns a single Boolean. The result is a new DataFrame/Series with only the groups for which the filter function returned True.
For example, suppose we want only the 'species'
whose average 'bill_length_mm'
is above 39.
(penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 39)
)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
152 | Chinstrap | Dream | 46.5 | 17.9 | 192.0 | 3500.0 | Female |
153 | Chinstrap | Dream | 50.0 | 19.5 | 196.0 | 3900.0 | Male |
154 | Chinstrap | Dream | 51.3 | 19.2 | 193.0 | 3650.0 | Male |
... | ... | ... | ... | ... | ... | ... | ... |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
187 rows × 7 columns
No more 'Adelie's!
Or, as another example, suppose we only want 'species' with more than 100 penguins:
(penguins
    .groupby('species')
    .filter(lambda df: df.shape[0] > 100)
)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
265 rows × 7 columns
No more 'Chinstrap's!
Question 🤔
Answer the following questions about grouping:
- In .agg(fn), what is the input to fn? What is the output of fn?
- In .transform(fn), what is the input to fn? What is the output of fn?
- In .filter(fn), what is the input to fn? What is the output of fn?
Grouping with multiple columns¶
When we group with multiple columns, one group is created for every unique combination of elements in the specified columns.
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
333 rows × 7 columns
species_and_island = (
    penguins
    .groupby(['species', 'island'])
    [['bill_length_mm', 'body_mass_g']]
    .mean()
)
species_and_island
bill_length_mm | body_mass_g | ||
---|---|---|---|
species | island | ||
Adelie | Biscoe | 38.98 | 3709.66 |
Dream | 38.52 | 3701.36 | |
Torgersen | 39.04 | 3708.51 | |
Chinstrap | Dream | 48.83 | 3733.09 |
Gentoo | Biscoe | 47.57 | 5092.44 |
Grouping and indexes¶
- The groupby method creates an index based on the specified columns.
- When grouping by multiple columns, the resulting DataFrame has a MultiIndex.
- Advice: When working with a MultiIndex, use reset_index or set as_index=False in groupby.
species_and_island
bill_length_mm | body_mass_g | ||
---|---|---|---|
species | island | ||
Adelie | Biscoe | 38.98 | 3709.66 |
Dream | 38.52 | 3701.36 | |
Torgersen | 39.04 | 3708.51 | |
Chinstrap | Dream | 48.83 | 3733.09 |
Gentoo | Biscoe | 47.57 | 5092.44 |
species_and_island['body_mass_g']
species island Adelie Biscoe 3709.66 Dream 3701.36 Torgersen 3708.51 Chinstrap Dream 3733.09 Gentoo Biscoe 5092.44 Name: body_mass_g, dtype: float64
species_and_island.loc['Adelie']
bill_length_mm | body_mass_g | |
---|---|---|
island | ||
Biscoe | 38.98 | 3709.66 |
Dream | 38.52 | 3701.36 |
Torgersen | 39.04 | 3708.51 |
species_and_island.loc[('Adelie', 'Torgersen')]
bill_length_mm 39.04 body_mass_g 3708.51 Name: (Adelie, Torgersen), dtype: float64
species_and_island.reset_index()
species | island | bill_length_mm | body_mass_g | |
---|---|---|---|---|
0 | Adelie | Biscoe | 38.98 | 3709.66 |
1 | Adelie | Dream | 38.52 | 3701.36 |
2 | Adelie | Torgersen | 39.04 | 3708.51 |
3 | Chinstrap | Dream | 48.83 | 3733.09 |
4 | Gentoo | Biscoe | 47.57 | 5092.44 |
(penguins
    .groupby(['species', 'island'], as_index=False)
    [['bill_length_mm', 'body_mass_g']]
    .mean()
)
species | island | bill_length_mm | body_mass_g | |
---|---|---|---|---|
0 | Adelie | Biscoe | 38.98 | 3709.66 |
1 | Adelie | Dream | 38.52 | 3701.36 |
2 | Adelie | Torgersen | 39.04 | 3708.51 |
3 | Chinstrap | Dream | 48.83 | 3733.09 |
4 | Gentoo | Biscoe | 47.57 | 5092.44 |
Question 🤔
Find the most popular Male and Female baby Name for each Year in baby. Exclude Years where there were fewer than 1 million births recorded.
baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby
Name | Sex | Count | Year | |
---|---|---|---|---|
0 | Liam | M | 20456 | 2022 |
1 | Noah | M | 18621 | 2022 |
2 | Olivia | F | 16573 | 2022 |
... | ... | ... | ... | ... |
2085155 | Wright | M | 5 | 1880 |
2085156 | York | M | 5 | 1880 |
2085157 | Zachariah | M | 5 | 1880 |
2085158 rows × 4 columns
# Your code goes here.
Pivot tables using the pivot_table method¶
Pivot tables: an extension of grouping¶
Pivot tables are a compact way to display tables for humans to read:
Sex | F | M |
---|---|---|
Year | ||
2018 | 1698373 | 1813377 |
2019 | 1675139 | 1790682 |
2020 | 1612393 | 1721588 |
2021 | 1635800 | 1743913 |
2022 | 1628730 | 1733166 |
- Notice that each value in the table is a sum over the counts, split by year and sex.
- You can think of pivot tables as grouping using two columns, then "pivoting" one of the group labels into columns.
pivot_table¶
The pivot_table
DataFrame method aggregates a DataFrame using two columns. To use it:
df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)
The resulting DataFrame will have:
- One row for every unique value in index_col.
- One column for every unique value in columns_col.
- Values determined by applying func on values in values_col.
last_5_years = baby.query('Year >= 2018')
last_5_years
Name | Sex | Count | Year | |
---|---|---|---|---|
0 | Liam | M | 20456 | 2022 |
1 | Noah | M | 18621 | 2022 |
2 | Olivia | F | 16573 | 2022 |
... | ... | ... | ... | ... |
159444 | Zyrie | M | 5 | 2018 |
159445 | Zyron | M | 5 | 2018 |
159446 | Zzyzx | M | 5 | 2018 |
159447 rows × 4 columns
last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum',
)
Sex | F | M |
---|---|---|
Year | ||
2018 | 1698373 | 1813377 |
2019 | 1675139 | 1790682 |
2020 | 1612393 | 1721588 |
2021 | 1635800 | 1743913 |
2022 | 1628730 | 1733166 |
# Look at the similarity to the snippet above!
(last_5_years
    .groupby(['Year', 'Sex'])
    [['Count']]
    .sum()
)
Count | ||
---|---|---|
Year | Sex | |
2018 | F | 1698373 |
M | 1813377 | |
2019 | F | 1675139 |
... | ... | ... |
2021 | M | 1743913 |
2022 | F | 1628730 |
M | 1733166 |
10 rows × 1 columns
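In fact, we can recover the pivot table from this grouped result ourselves. The snippet below is a sketch (not part of the original lecture code) that makes the "pivoting" step explicit: it moves the 'Sex' level of the index into the columns (more on unstack in the Reshaping section later).
# A sketch: group, aggregate, then pivot the 'Sex' index level into columns.
# The result matches the pivot_table output above.
(last_5_years
    .groupby(['Year', 'Sex'])
    ['Count']
    .sum()
    .unstack()
)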
Example¶
Find the number of penguins per 'island' and 'species'.
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
333 rows × 7 columns
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',  # Choice of column here doesn't actually matter!
    aggfunc='count',
)
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 44.0 | 55.0 | 47.0 |
Chinstrap | NaN | 68.0 | NaN |
Gentoo | 119.0 | NaN | NaN |
Note that there is a NaN at the intersection of 'Biscoe' and 'Chinstrap', because there were no Chinstrap penguins on Biscoe Island.
We can either use the fillna method afterwards or the fill_value argument to fill in NaNs.
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 44 | 55 | 47 |
Chinstrap | 0 | 68 | 0 |
Gentoo | 119 | 0 | 0 |
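For completeness, here's what the fillna approach mentioned above could look like. This is a minimal sketch (the name island_counts is just for illustration); note that the NaNs force the counts into a float dtype, so we cast back to int at the end.
# Build the pivot table without fill_value, then fill missing combinations with 0.
island_counts = penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
)
# The NaNs made the columns floats, so cast back to int after filling.
island_counts.fillna(0).astype(int)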
Granularity, revisited¶
Take another look at the pivot table from the previous slide. Each row of the original penguins
DataFrame represented a single penguin, and each column represented features of the penguins.
What is the granularity of the DataFrame below?
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 44 | 55 | 47 |
Chinstrap | 0 | 68 | 0 |
Gentoo | 119 | 0 | 0 |
Reshaping¶
- pivot_table reshapes DataFrames from "long" to "wide".
- Other DataFrame reshaping methods (a small sketch follows this list):
  - melt: Un-pivots a DataFrame. Very useful in data cleaning.
  - pivot: Like pivot_table, but doesn't do aggregation.
  - stack: Pivots multi-level columns to multi-indices.
  - unstack: Pivots multi-indices to columns.
- Google and the documentation are your friends!
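For instance, melt can un-pivot the "wide" penguin counts back into one row per (species, island) pair. This is a sketch, not part of the original lecture code; the name wide_counts is just for this example.
# Start from the wide pivot table of counts.
wide_counts = penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)
# melt un-pivots: each (species, island) combination becomes its own row again.
wide_counts.reset_index().melt(id_vars='species', var_name='island', value_name='count')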
Distributions¶
Joint distribution¶
When using aggfunc='count', a pivot table describes the joint distribution of two categorical variables. This is also called a contingency table.
counts = penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='count',
    fill_value=0
)
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
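As an aside, pandas also has a dedicated function for building contingency tables. The one-liner below is a sketch (not from the lecture) that produces the same table of counts as the pivot table above.
# pd.crosstab counts co-occurrences directly, with no values column needed.
pd.crosstab(penguins['species'], penguins['sex'])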
We can normalize the DataFrame by dividing by the total number of penguins. The resulting numbers can be interpreted as probabilities that a randomly selected penguin from the dataset belongs to a given combination of species and sex.
joint = counts / counts.sum().sum()
joint
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.22 | 0.22 |
Chinstrap | 0.10 | 0.10 |
Gentoo | 0.17 | 0.18 |
Marginal probabilities¶
If we sum over one of the axes, we can compute marginal probabilities, i.e. unconditional probabilities.
joint
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.22 | 0.22 |
Chinstrap | 0.10 | 0.10 |
Gentoo | 0.17 | 0.18 |
# Recall, joint.sum(axis=0) sums across the rows,
# which computes the sum of the **columns**.
joint.sum(axis=0)
sex Female 0.5 Male 0.5 dtype: float64
joint.sum(axis=1)
species Adelie 0.44 Chinstrap 0.20 Gentoo 0.36 dtype: float64
For instance, the second Series tells us that a randomly selected penguin has a 0.36 chance of being of species 'Gentoo'.
Conditional probabilities¶
Using counts, how might we compute conditional probabilities like $$P(\text{species} = \text{"Adelie"} \mid \text{sex} = \text{"Female"})?$$
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
Here's a derivation:
$$\begin{align*} P(\text{species} = c \mid \text{sex} = x) &= \frac{P(\text{species} = c \text{ and } \text{sex} = x)}{P(\text{sex} = x)} \\ &= \frac{\frac{\# \: (\text{species} = c \text{ and } \text{sex} = x)}{N}}{\frac{\# \: (\text{sex} = x)}{N}} \\ &= \frac{\# \: (\text{species} = c \text{ and } \text{sex} = x)}{\# \: (\text{sex} = x)} \end{align*}$$
Answer: To find conditional probabilities of 'species' given 'sex', divide by column sums. To find conditional probabilities of 'sex' given 'species', divide by row sums.
Conditional probabilities¶
To find conditional probabilities of 'species' given 'sex', divide by column sums. To find conditional probabilities of 'sex' given 'species', divide by row sums.
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
counts.sum(axis=0)
sex Female 165 Male 168 dtype: int64
The conditional distribution of 'species' given 'sex' is below. Note that in this new DataFrame, the 'Female' and 'Male' columns each sum to 1.
counts / counts.sum(axis=0)
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.44 | 0.43 |
Chinstrap | 0.21 | 0.20 |
Gentoo | 0.35 | 0.36 |
For instance, the above DataFrame tells us that the probability that a randomly selected penguin is of 'species' 'Adelie', given that they are of 'sex' 'Female', is 0.442424.
Question 🤔
Find the conditional distribution of 'sex' given 'species'.
Hint: Use .T.
# Your code goes here.
Example: Grades¶
Two students, Lisa and Bart, just finished their first year at UCSD. They each took a different number of classes in Fall, Winter, and Spring.
Each quarter, Lisa had a higher GPA than Bart.
But Bart has a higher overall GPA.
How is this possible? 🤔
Run this cell to create DataFrames that contain each student's grades.
lisa = pd.DataFrame([[20, 46], [18, 54], [5, 20]],
    columns=['Units', 'Grade Points Earned'],
    index=['Fall', 'Winter', 'Spring'],
)
lisa.columns.name = 'Lisa'  # This allows us to see the name "Lisa" in the top left of the DataFrame.

bart = pd.DataFrame([[5, 10], [5, 13.5], [22, 81.4]],
    columns=['Units', 'Grade Points Earned'],
    index=['Fall', 'Winter', 'Spring'],
)
bart.columns.name = 'Bart'
Quarter-specific vs. overall GPAs¶
Note: The number of "grade points" earned for a course is
$$\text{number of units} \cdot \text{grade (out of 4)}$$
For instance, an A- in a 4-unit course earns $3.7 \cdot 4 = 14.8$ grade points.
dfs_side_by_side(lisa, bart)
Lisa | Units | Grade Points Earned |
---|---|---|
Fall | 20 | 46 |
Winter | 18 | 54 |
Spring | 5 | 20 |
Bart | Units | Grade Points Earned |
---|---|---|
Fall | 5 | 10.0 |
Winter | 5 | 13.5 |
Spring | 22 | 81.4 |
Lisa had a higher GPA in all three quarters.
quarterly_gpas = pd.DataFrame({
    "Lisa's Quarter GPA": lisa['Grade Points Earned'] / lisa['Units'],
    "Bart's Quarter GPA": bart['Grade Points Earned'] / bart['Units'],
})
quarterly_gpas
Lisa's Quarter GPA | Bart's Quarter GPA | |
---|---|---|
Fall | 2.3 | 2.0 |
Winter | 3.0 | 2.7 |
Spring | 4.0 | 3.7 |
Question 🤔 (Answer at dsc80.com/q)
Use the DataFrame lisa to compute Lisa's overall GPA, and use the DataFrame bart to compute Bart's overall GPA.
# Helper function to show lisa and bart side-by-side to save screen space
dfs_side_by_side(lisa, bart)
Lisa | Units | Grade Points Earned |
---|---|---|
Fall | 20 | 46 |
Winter | 18 | 54 |
Spring | 5 | 20 |
Bart | Units | Grade Points Earned |
---|---|---|
Fall | 5 | 10.0 |
Winter | 5 | 13.5 |
Spring | 22 | 81.4 |
# Your code goes here.
What happened?¶
(quarterly_gpas
    .assign(Lisa_Units=lisa['Units'],
            Bart_Units=bart['Units'])
    .iloc[:, [0, 2, 1, 3]]
)
Lisa's Quarter GPA | Lisa_Units | Bart's Quarter GPA | Bart_Units | |
---|---|---|---|---|
Fall | 2.3 | 20 | 2.0 | 5 |
Winter | 3.0 | 18 | 2.7 | 5 |
Spring | 4.0 | 5 | 3.7 | 22 |
When Lisa and Bart both performed poorly, Lisa took more units than Bart. This brought down 📉 Lisa's overall average.
When Lisa and Bart both performed well, Bart took more units than Lisa. This brought up 📈 Bart's overall average.
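To see the reversal numerically, here's a quick sketch (the variable names are just for illustration) that computes each overall GPA as total grade points earned divided by total units, i.e. a units-weighted average of the quarterly GPAs.
# Overall GPA = total grade points earned / total units taken.
lisa_overall = lisa['Grade Points Earned'].sum() / lisa['Units'].sum()
bart_overall = bart['Grade Points Earned'].sum() / bart['Units'].sum()
lisa_overall, bart_overall  # Lisa's is lower, even though she led in every single quarter.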
Simpson's paradox¶
Simpson's paradox occurs when grouped data and ungrouped data show opposing trends.
- It is named after Edward H. Simpson, not Lisa or Bart Simpson.
It often happens because there is a hidden factor (i.e. a confounder) within the data that influences results.
Question: What is the "correct" way to summarize your data? What if you had to act on these results?
show_paradox_slides()
What happened?¶
The overall acceptance rate for women (30%) was lower than it was for men (45%).
However, most departments (A, B, D, F) had a higher acceptance rate for women.
Department A had a 62% acceptance rate for men and an 82% acceptance rate for women!
- 31% of men applied to Department A.
- 6% of women applied to Department A.
Department F had a 6% acceptance rate for men and a 7% acceptance rate for women!
- 14% of men applied to Department F.
- 19% of women applied to Department F.
Conclusion: Women tended to apply to departments with a lower acceptance rate; the data don't support the hypothesis that there was major gender discrimination against women.
Example: Restaurant reviews and phone types¶
You are deciding whether to eat at Dirty Birds or The Loft.
Suppose Yelp shows ratings aggregated by phone type (Android vs. iPhone).
Phone Type | Stars for Dirty Birds | Stars for The Loft |
---|---|---|
Android | 4.24 | 4.0 |
iPhone | 2.99 | 2.79 |
All | 3.32 | 3.37 |
Question: Should you choose Dirty Birds or The Loft?
Answer: The type of phone you use likely has nothing to do with your taste in food – pick the restaurant that is rated higher overall.
Rule of thumb 👍¶
Let $(X, Y)$ be a pair of variables of interest. Simpson's paradox occurs when the association between $X$ and $Y$ reverses when we condition on $Z$, a third variable.
If $Z$ has a causal connection to both $X$ and $Y$, we should condition on $Z$ and use the disaggregated data.
If not, we shouldn't condition on $Z$; we should use the aggregated data instead.
Berkeley gender discrimination: $X$ is gender, $Y$ is acceptance rate. $Z$ is the department.
- $Z$ has a plausible causal effect on both $X$ and $Y$, so we should condition on $Z$.
Yelp ratings: $X$ is the restaurant, $Y$ is the average stars. $Z$ is the phone type.
- $Z$ doesn't plausibly cause $X$ to change, so we should not condition on $Z$.
Takeaways¶
Be skeptical of...
- Aggregate statistics.
- People misusing statistics to "prove" that discrimination doesn't exist.
- Drawing conclusions from individual publications ($p$-hacking, publication bias, narrow focus, etc.).
- Everything!
We need to apply domain knowledge and make judgment calls to decide what to do when Simpson's paradox is present.
Really?¶
To handle Simpson's paradox with rigor, we need some ideas from causal inference which we don't have time to cover in DSC 80. This video has a good example of how to approach Simpson's paradox using a minimal amount of causal inference, if you're curious (not required for DSC 80).
IFrame('https://www.youtube-nocookie.com/embed/zeuW1Z2EtLs?si=l2Dl7P-5RCq3ODpo',
width=800, height=450)
Further reading¶
- What is Simpson's Paradox?
- Understanding Simpson's Paradox
- Requires more statistics background, but gives a rigorous understanding of when to use aggregated vs. unaggregated data.