import pandas as pd
import numpy as np
import os
import seaborn as sns
from IPython.display import display, IFrame
def show_paradox_slides():
    src = 'https://docs.google.com/presentation/d/e/2PACX-1vSbFSaxaYZ0NcgrgqZLvjhkjX-5MQzAITWAsEFZHnix3j1c0qN8Vd1rogTAQP7F7Nf5r-JWExnGey7h/embed?start=false'
    width = 960
    height = 569
    display(IFrame(src, width, height))
penguins = sns.load_dataset('penguins').dropna()
penguins.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
For each species, find the island on which the heaviest penguin of that species lives.
# Why doesn't this work?
penguins.groupby('species').max()
island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|
species | ||||||
Adelie | Torgersen | 46.0 | 21.5 | 210.0 | 4775.0 | Male |
Chinstrap | Dream | 58.0 | 20.8 | 212.0 | 4800.0 | Male |
Gentoo | Biscoe | 59.6 | 17.3 | 231.0 | 6300.0 | Male |
This doesn't work because `max` is applied to each column independently: the `'island'` shown for each species is the alphabetically last island among that species' rows, not the island of its heaviest penguin. Instead, sort by body mass and take the first row in each group:
penguins.sort_values('body_mass_g', ascending=False).groupby('species').first()
island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|
species | ||||||
Adelie | Biscoe | 43.2 | 19.0 | 197.0 | 4775.0 | Male |
Chinstrap | Dream | 52.0 | 20.7 | 210.0 | 4800.0 | Male |
Gentoo | Biscoe | 49.2 | 15.2 | 221.0 | 6300.0 | Male |
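As a side note, a sketch of an equivalent approach: use `idxmax` to find the row label of the heaviest penguin in each species, then look those labels up with `loc`.

# The row of the heaviest penguin per species (same rows as above)
penguins.loc[penguins.groupby('species')['body_mass_g'].idxmax()]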
When we group with multiple columns, one group is created for every unique combination of elements in the specified columns.
double_group = penguins.groupby(['species', 'island'])
double_group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe2d0178b20>
double_group.groups
{('Adelie', 'Biscoe'): [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], ('Adelie', 'Dream'): [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151], ('Adelie', 'Torgersen'): [0, 1, 2, 4, 5, 6, 7, 12, 13, 14, 15, 16, 17, 18, 19, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131], ('Chinstrap', 'Dream'): [152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219], ('Gentoo', 'Biscoe'): [220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, ...]}
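To pull out the rows of a single group, we can use `get_group`; a minimal sketch:

# All rows for one (species, island) combination
double_group.get_group(('Adelie', 'Biscoe')).head()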
for key, df in double_group:
    display(df.head())
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
20 | Adelie | Biscoe | 37.8 | 18.3 | 174.0 | 3400.0 | Female |
21 | Adelie | Biscoe | 37.7 | 18.7 | 180.0 | 3600.0 | Male |
22 | Adelie | Biscoe | 35.9 | 19.2 | 189.0 | 3800.0 | Female |
23 | Adelie | Biscoe | 38.2 | 18.1 | 185.0 | 3950.0 | Male |
24 | Adelie | Biscoe | 38.8 | 17.2 | 180.0 | 3800.0 | Male |
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
30 | Adelie | Dream | 39.5 | 16.7 | 178.0 | 3250.0 | Female |
31 | Adelie | Dream | 37.2 | 18.1 | 178.0 | 3900.0 | Male |
32 | Adelie | Dream | 39.5 | 17.8 | 188.0 | 3300.0 | Female |
33 | Adelie | Dream | 40.9 | 18.9 | 184.0 | 3900.0 | Male |
34 | Adelie | Dream | 36.4 | 17.0 | 195.0 | 3325.0 | Female |
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
152 | Chinstrap | Dream | 46.5 | 17.9 | 192.0 | 3500.0 | Female |
153 | Chinstrap | Dream | 50.0 | 19.5 | 196.0 | 3900.0 | Male |
154 | Chinstrap | Dream | 51.3 | 19.2 | 193.0 | 3650.0 | Male |
155 | Chinstrap | Dream | 45.4 | 18.7 | 188.0 | 3525.0 | Female |
156 | Chinstrap | Dream | 52.7 | 19.8 | 197.0 | 3725.0 | Male |
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
220 | Gentoo | Biscoe | 46.1 | 13.2 | 211.0 | 4500.0 | Female |
221 | Gentoo | Biscoe | 50.0 | 16.3 | 230.0 | 5700.0 | Male |
222 | Gentoo | Biscoe | 48.7 | 14.1 | 210.0 | 4450.0 | Female |
223 | Gentoo | Biscoe | 50.0 | 15.2 | 218.0 | 5700.0 | Male |
224 | Gentoo | Biscoe | 47.6 | 14.5 | 215.0 | 5400.0 | Male |
penguins.groupby(['species', 'island']).mean()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g
---|---|---|---|---|---
Adelie | Biscoe | 38.975000 | 18.370455 | 188.795455 | 3709.659091
Adelie | Dream | 38.520000 | 18.240000 | 189.927273 | 3701.363636
Adelie | Torgersen | 39.038298 | 18.451064 | 191.531915 | 3708.510638
Chinstrap | Dream | 48.833824 | 18.420588 | 195.823529 | 3733.088235
Gentoo | Biscoe | 47.568067 | 14.996639 | 217.235294 | 5092.436975
The `groupby` method creates an index based on the specified columns. When we group by multiple columns, the result has a `MultiIndex`. To avoid a `MultiIndex`, use `reset_index` afterwards or set `as_index=False` in `groupby`.

weird = penguins.groupby(['species', 'island']).mean()
weird
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g
---|---|---|---|---|---
Adelie | Biscoe | 38.975000 | 18.370455 | 188.795455 | 3709.659091
Adelie | Dream | 38.520000 | 18.240000 | 189.927273 | 3701.363636
Adelie | Torgersen | 39.038298 | 18.451064 | 191.531915 | 3708.510638
Chinstrap | Dream | 48.833824 | 18.420588 | 195.823529 | 3733.088235
Gentoo | Biscoe | 47.568067 | 14.996639 | 217.235294 | 5092.436975
weird['body_mass_g']
species    island
Adelie     Biscoe       3709.659091
           Dream        3701.363636
           Torgersen    3708.510638
Chinstrap  Dream        3733.088235
Gentoo     Biscoe       5092.436975
Name: body_mass_g, dtype: float64
weird.loc['Adelie']
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
island | ||||
Biscoe | 38.975000 | 18.370455 | 188.795455 | 3709.659091 |
Dream | 38.520000 | 18.240000 | 189.927273 | 3701.363636 |
Torgersen | 39.038298 | 18.451064 | 191.531915 | 3708.510638 |
weird.loc[('Adelie', 'Torgersen')]
bill_length_mm         39.038298
bill_depth_mm          18.451064
flipper_length_mm     191.531915
body_mass_g          3708.510638
Name: (Adelie, Torgersen), dtype: float64
weird.reset_index()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|---|
0 | Adelie | Biscoe | 38.975000 | 18.370455 | 188.795455 | 3709.659091 |
1 | Adelie | Dream | 38.520000 | 18.240000 | 189.927273 | 3701.363636 |
2 | Adelie | Torgersen | 39.038298 | 18.451064 | 191.531915 | 3708.510638 |
3 | Chinstrap | Dream | 48.833824 | 18.420588 | 195.823529 | 3733.088235 |
4 | Gentoo | Biscoe | 47.568067 | 14.996639 | 217.235294 | 5092.436975 |
penguins.groupby(['species', 'island'], as_index=False).mean()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|---|
0 | Adelie | Biscoe | 38.975000 | 18.370455 | 188.795455 | 3709.659091 |
1 | Adelie | Dream | 38.520000 | 18.240000 | 189.927273 | 3701.363636 |
2 | Adelie | Torgersen | 39.038298 | 18.451064 | 191.531915 | 3708.510638 |
3 | Chinstrap | Dream | 48.833824 | 18.420588 | 195.823529 | 3733.088235 |
4 | Gentoo | Biscoe | 47.568067 | 14.996639 | 217.235294 | 5092.436975 |
To find the above information, we can group by both `'species'` and `'island'`.
penguins.groupby(['species', 'island'])['body_mass_g'].mean()
species    island
Adelie     Biscoe       3709.659091
           Dream        3701.363636
           Torgersen    3708.510638
Chinstrap  Dream        3733.088235
Gentoo     Biscoe       5092.436975
Name: body_mass_g, dtype: float64
But we can also create a pivot table.
penguins.pivot_table(index='species',
columns='island',
values='body_mass_g',
aggfunc='mean')
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 3709.659091 | 3701.363636 | 3708.510638 |
Chinstrap | NaN | 3733.088235 | NaN |
Gentoo | 5092.436975 | NaN | NaN |
Note that the DataFrame above shows the same information as the Series above it, just in a different arrangement.
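In fact, stacking the pivot table recovers the grouped Series (up to the ordering of the index); a minimal sketch using `stack`, which is discussed more below:

# stack moves the 'island' columns back into an inner index level, dropping NaNs
penguins.pivot_table(index='species', columns='island',
                     values='body_mass_g', aggfunc='mean').stack()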
### `pivot_table`

The `pivot_table` DataFrame method aggregates a DataFrame using two columns. To use it:

df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

The resulting DataFrame has one row for every unique value in `index_col`, one column for every unique value in `columns_col`, and values computed by applying `func` to the values in `values_col`.

Find the number of penguins per island and species.
penguins.pivot_table(index='island',
columns='species',
values='bill_length_mm',
aggfunc='count')
species | Adelie | Chinstrap | Gentoo |
---|---|---|---|
island | |||
Biscoe | 44.0 | NaN | 119.0 |
Dream | 55.0 | 68.0 | NaN |
Torgersen | 47.0 | NaN | NaN |
Note that there is a `NaN` at the intersection of `'Biscoe'` and `'Chinstrap'`, because there were no Chinstrap penguins on Biscoe Island.

We can either use the `fillna` method afterwards or the `fill_value` argument to fill in `NaN`s.
penguins.pivot_table(index='island',
columns='species',
values='bill_length_mm',
aggfunc='count').fillna(0)
species | Adelie | Chinstrap | Gentoo |
---|---|---|---|
island | |||
Biscoe | 44.0 | 0.0 | 119.0 |
Dream | 55.0 | 68.0 | 0.0 |
Torgersen | 47.0 | 0.0 | 0.0 |
penguins.pivot_table(index='island',
columns='species',
values='bill_length_mm',
aggfunc='count',
fill_value=0)
species | Adelie | Chinstrap | Gentoo |
---|---|---|---|
island | |||
Biscoe | 44 | 0 | 119 |
Dream | 55 | 68 | 0 |
Torgersen | 47 | 0 | 0 |
Find the mean body mass per species and sex.
penguins.pivot_table(index='species', columns='sex', values='body_mass_g', aggfunc='mean')
sex | Female | Male |
---|---|---|
species | ||
Adelie | 3368.835616 | 4043.493151 |
Chinstrap | 3527.205882 | 3938.970588 |
Gentoo | 4679.741379 | 5484.836066 |
Important: In `penguins`, each row corresponds to an individual/observation. In the pivot table above, that is no longer true.

When using `aggfunc='count'`, a pivot table describes the joint distribution of two categorical variables.
counts = penguins.pivot_table(index='species',
columns='sex',
values='body_mass_g',
aggfunc='count',
fill_value=0)
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
We can normalize the DataFrame by dividing by the total number of penguins. The resulting numbers can be interpreted as probabilities that a randomly selected penguin from the dataset belongs to a given combination of species and sex.
joint = counts / counts.sum().sum()
joint
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.219219 | 0.219219 |
Chinstrap | 0.102102 | 0.102102 |
Gentoo | 0.174174 | 0.183183 |
If we sum over one of the axes, we can compute marginal probabilities.
joint
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.219219 | 0.219219 |
Chinstrap | 0.102102 | 0.102102 |
Gentoo | 0.174174 | 0.183183 |
joint.sum(axis=1)
species
Adelie       0.438438
Chinstrap    0.204204
Gentoo       0.357357
dtype: float64
joint.sum(axis=0)
sex
Female    0.495495
Male      0.504505
dtype: float64

For instance, the first Series tells us that a randomly selected penguin has a 0.357357 chance of being of species `'Gentoo'`.
If we divide `counts` by row or column sums, we can compute conditional probabilities.
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
counts.sum(axis=0)
sex
Female    165
Male      168
dtype: int64
The conditional distribution of species given sex is below.
counts / counts.sum(axis=0)
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.442424 | 0.434524 |
Chinstrap | 0.206061 | 0.202381 |
Gentoo | 0.351515 | 0.363095 |
For instance, the above DataFrame tells us that the probability that a randomly selected penguin is of species `'Adelie'`, given that they are of sex `'Female'`, is 0.442424.
The conditional distribution of sex given species is below.
counts.T / counts.sum(axis=1)
species | Adelie | Chinstrap | Gentoo |
---|---|---|---|
sex | |||
Female | 0.5 | 0.5 | 0.487395 |
Male | 0.5 | 0.5 | 0.512605 |
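As an aside, the transpose trick above can be avoided with `div`, which lets us specify the axis to align on; a minimal sketch that computes the same conditional distribution, with species as rows instead of columns:

# Divide each row of counts by that row's sum
counts.div(counts.sum(axis=1), axis=0)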
### `pivot_table` aggregates and reshapes

The `pivot_table` method does two things: it aggregates (like `groupby`) and it reshapes the result from "long" to "wide". To see the reshaping on its own, consider the `pivot` method below.

moves = pd.DataFrame([
[1, 1, 'O'],
[2, 1, 'X'],
[2, 2, 'X'],
[2, 3, 'O'],
[3, 1, 'O'],
[3, 3, 'X']
], columns=['i', 'j', 'move'])
moves
i | j | move | |
---|---|---|---|
0 | 1 | 1 | O |
1 | 2 | 1 | X |
2 | 2 | 2 | X |
3 | 2 | 3 | O |
4 | 3 | 1 | O |
5 | 3 | 3 | X |
moves.pivot(index='i', columns='j', values='move').fillna('')
j | 1 | 2 | 3 |
---|---|---|---|
i | |||
1 | O | ||
2 | X | X | O |
3 | O | X |
The `pivot` method only reshapes a DataFrame. It does not change any of the values in it (i.e. `aggfunc` doesn't work with `pivot`).

### `pivot_table` = `groupby` + `pivot`

Using `pivot_table` is a shortcut for using `groupby` and then using `pivot`.

(
penguins.groupby(['species', 'sex'])[['body_mass_g']]
.mean()
.reset_index()
.pivot(index='species', columns='sex', values='body_mass_g')
)
sex | Female | Male |
---|---|---|
species | ||
Adelie | 3368.835616 | 4043.493151 |
Chinstrap | 3527.205882 | 3938.970588 |
Gentoo | 4679.741379 | 5484.836066 |
penguins.pivot_table(index='species', columns='sex', values='body_mass_g', aggfunc='mean')
sex | Female | Male |
---|---|---|
species | ||
Adelie | 3368.835616 | 4043.493151 |
Chinstrap | 3527.205882 | 3938.970588 |
Gentoo | 4679.741379 | 5484.836066 |
Here, `aggfunc='mean'` plays the same role that `.mean()` does.

Some related reshaping methods (see the sketch after this list):
- `pivot_table` and `pivot` reshape DataFrames from "long" to "wide".
- `melt`: un-pivots a DataFrame.
- `stack`: pivots multi-level columns to multi-indices.
- `unstack`: pivots multi-indices to columns.
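For instance, a sketch of `unstack`: applied to the grouped Series, it produces the same wide DataFrame as the `pivot_table` call above.

# Move the inner index level ('sex') into the columns
penguins.groupby(['species', 'sex'])['body_mass_g'].mean().unstack()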
Note: The number of "grade points" you earn for a course is

$$\text{number of units} \cdot \text{grade (out of 4)}$$

So an A- in a 4-unit course earns $4 \cdot 3.7 = 14.8$ grade points.
lisa = pd.DataFrame([
[20, 46],
[18, 54],
[5, 20]
],
columns=['Units', 'Grade Points Earned'],
index=['Fall', 'Winter', 'Spring'])
lisa
Units | Grade Points Earned | |
---|---|---|
Fall | 20 | 46 |
Winter | 18 | 54 |
Spring | 5 | 20 |
bart = pd.DataFrame([
[5, 10],
[5, 13.5],
[22, 81.4]
],
columns=['Units', 'Grade Points Earned'],
index=['Fall', 'Winter', 'Spring'])
bart
Units | Grade Points Earned | |
---|---|---|
Fall | 5 | 10.0 |
Winter | 5 | 13.5 |
Spring | 22 | 81.4 |
The following DataFrame shows that Lisa had a higher GPA in all three quarters.
quarterly_gpas = pd.DataFrame(
{
"Lisa's Quarter GPA": lisa['Grade Points Earned'] / lisa['Units'],
"Bart's Quarter GPA": bart['Grade Points Earned'] / bart['Units']
}
)
quarterly_gpas
Lisa's Quarter GPA | Bart's Quarter GPA | |
---|---|---|
Fall | 2.3 | 2.0 |
Winter | 3.0 | 2.7 |
Spring | 4.0 | 3.7 |
But Lisa's overall GPA is less than Bart's overall GPA.
tot = lisa.sum()
tot['Grade Points Earned'] / tot['Units']
2.7906976744186047
tot = bart.sum()
tot['Grade Points Earned'] / tot['Units']
3.278125
quarterly_gpas.assign(Lisa_units=lisa['Units']) \
.assign(Bart_units=bart['Units']) \
.iloc[:, [0, 2, 1, 3]]
Lisa's Quarter GPA | Lisa_units | Bart's Quarter GPA | Bart_units | |
---|---|---|---|---|
Fall | 2.3 | 20 | 2.0 | 5 |
Winter | 3.0 | 18 | 2.7 | 5 |
Spring | 4.0 | 5 | 3.7 | 22 |
Simpson's paradox occurs when grouped data and ungrouped data show opposing trends.
It is purely arithmetic – it is a consequence of weighted averages.
It often happens because there is a hidden factor (i.e. a confounder) within the data that influences results.
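We can check the weighted-average arithmetic directly; a minimal sketch using `np.average`:

# Overall GPA = average of quarterly GPAs, weighted by units taken
np.average(quarterly_gpas["Lisa's Quarter GPA"], weights=lisa['Units'])  # 2.790...
np.average(quarterly_gpas["Bart's Quarter GPA"], weights=bart['Units'])  # 3.278125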
Question: What is the "correct" way to summarize your data? What if you had to act on these results?
show_paradox_slides()
This doesn't mean that admissions are free from gender discrimination!
From Moss-Racusin et al., 2012, PNAS (cited 2600+ times):
In a randomized double-blind study (n = 127), science faculty from research-intensive universities rated the application materials of a student—who was randomly assigned either a male or female name—for a laboratory manager position. Faculty participants rated the male applicant as significantly more competent and hireable than the (identical) female applicant. These participants also selected a higher starting salary and offered more career mentoring to the male applicant. The gender of the faculty participants did not affect responses, such that female and male faculty were equally likely to exhibit bias against the female student.
From Williams and Ceci, 2015, PNAS:
Here we report five hiring experiments in which faculty evaluated hypothetical female and male applicants, using systematically varied profiles disguising identical scholarship, for assistant professorships in biology, engineering, economics, and psychology. Contrary to prevailing assumptions, men and women faculty members from all four fields preferred female applicants 2:1 over identically qualified males with matching lifestyles (single, married, divorced), with the exception of male economists, who showed no gender preference.
Not necessarily. One explanation, from Williams and Ceci:
Instead, past studies have used ratings of students’ hirability for a range of posts that do not include tenure-track jobs, such as managing laboratories or performing math assignments for a company. However, hiring tenure-track faculty differs from hiring lower-level staff: it entails selecting among highly accomplished candidates, all of whom have completed Ph.D.s and amassed publications and strong letters of support. Hiring bias may occur when applicants’ records are ambiguous, as was true in studies of hiring bias for lower-level staff posts, but such bias may not occur when records are clearly strong, as is the case with tenure-track hiring.
From Witteman et al., 2019, in The Lancet:
Thus, evidence of scientists favouring women comes exclusively from hypothetical scenarios, whereas evidence of scientists favouring men comes from hypothetical scenarios and real behaviour. This might reflect academics' growing awareness of the social desirability of achieving gender balance, while real academic behaviour might not yet put such ideals into action.
Example: consider the following average ratings for two restaurants, split by the type of phone the reviewer used.

Phone Type | Stars for Dirty Birds | Stars for The Loft
---|---|---|
Android | 4.24 | 4.0 |
iPhone | 2.99 | 2.79 |
All | 3.32 | 3.37 |
ratings = pd.read_csv('data/ratings.csv')
ratings.sample(5).head()
phone | restaurant | rating | |
---|---|---|---|
4200 | iPhone | The Loft | 2 |
2592 | iPhone | Dirty Birds | 3 |
2658 | iPhone | Dirty Birds | 3 |
4414 | iPhone | The Loft | 3 |
2516 | iPhone | Dirty Birds | 3 |
Disaggregated means (split by phone type):
ratings.pivot_table(index='phone', columns='restaurant', values='rating', aggfunc='mean')
restaurant | Dirty Birds | The Loft |
---|---|---|
phone | ||
Android | 4.235669 | 4.000000 |
iPhone | 2.987957 | 2.787971 |
Aggregated means (overall, ignoring phone type):
ratings.groupby('restaurant').mean()
rating | |
---|---|
restaurant | |
Dirty Birds | 3.320652 |
The Loft | 3.366205 |
Be skeptical of...
For the rest of this week, we will look at how to combine multiple DataFrames.
Question: How do we combine multiple datasets?
### `pd.concat`

The `pd.concat` function combines DataFrame and Series objects. It has many options; we'll learn them slowly.

By default, `pd.concat` stacks DataFrames row-wise, i.e. on top of one another.
section_A = pd.DataFrame({
'Name': ['Annie', 'Billy', 'Sally', 'Tommy'],
'Midterm': [98, 82, 23, 45],
'Final': [88, 100, 99, 67]
})
section_A
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
section_B = pd.DataFrame({
'Name': ['Junior', 'Rex', 'Flash'],
'Midterm': [70, 99, 81],
'Final': [42, 25, 90]
})
section_B
Name | Midterm | Final | |
---|---|---|---|
0 | Junior | 70 | 42 |
1 | Rex | 99 | 25 |
2 | Flash | 81 | 90 |
Let's use `pd.concat` on a list of the above two DataFrames.
pd.concat([section_A, section_B])
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
0 | Junior | 70 | 42 |
1 | Rex | 99 | 25 |
2 | Flash | 81 | 90 |
Setting the optional argument `ignore_index` to `True` fixes the index (which `.reset_index()` could also do).
pd.concat([section_A, section_B], ignore_index=True)
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
4 | Junior | 70 | 42 |
5 | Rex | 99 | 25 |
6 | Flash | 81 | 90 |
To keep track of which original DataFrame each row came from, we can use the `keys` optional argument.
combined = pd.concat([section_A, section_B], keys=['Section A', 'Section B'])
combined
 | | Name | Midterm | Final
---|---|---|---|---
Section A | 0 | Annie | 98 | 88
Section A | 1 | Billy | 82 | 100
Section A | 2 | Sally | 23 | 99
Section A | 3 | Tommy | 45 | 67
Section B | 0 | Junior | 70 | 42
Section B | 1 | Rex | 99 | 25
Section B | 2 | Flash | 81 | 90
The resulting DataFrame has a MultiIndex, though.
combined.loc['Section A']
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
If we concatenate two DataFrames that don't share the same column names, `NaN`s are added in the columns that aren't shared.
section_C = pd.DataFrame({
'Name': ['Justin', 'Marina'],
'Final': [98, 52]
})
section_C
Name | Final | |
---|---|---|
0 | Justin | 98 |
1 | Marina | 52 |
section_D = pd.DataFrame({
'Name': ['Janine', 'Aaron', 'Suraj'],
'Midterm': [10, 80, 40]
})
section_D
Name | Midterm | |
---|---|---|
0 | Janine | 10 |
1 | Aaron | 80 |
2 | Suraj | 40 |
pd.concat([section_C, section_D])
Name | Final | Midterm | |
---|---|---|---|
0 | Justin | 98.0 | NaN |
1 | Marina | 52.0 | NaN |
0 | Janine | NaN | 10.0 |
1 | Aaron | NaN | 80.0 |
2 | Suraj | NaN | 40.0 |
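`pd.concat` can also stack DataFrames side by side: passing `axis=1` aligns rows by index and, again, fills missing positions with `NaN`. A minimal sketch:

# Place section_C (2 rows) and section_D (3 rows) next to each other
pd.concat([section_C, section_D], axis=1)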
- `pd.concat` returns a copy; it does not modify any of the input DataFrames.
- Warning: never call `pd.concat` repeatedly in a loop, as it has terrible time and space efficiency.

# Don't do this! Each iteration copies every row accumulated so far.
total = pd.DataFrame()
for df in dataframes:
    total = pd.concat([total, df])

- Instead, call `pd.concat(dataframes)` once, where `dataframes` is a list of DataFrames.

`os.listdir(dirname)` returns a list of the names of the files in the folder `dirname`.

os.listdir('data')
['.DS_Store', 'ratings.csv']
os.listdir('../')
['.DS_Store', 'lec01', 'lec06', 'lec08', 'lec07', '.ipynb_checkpoints', 'lec05', 'lec02', 'lec03', 'lec04']
The following does something similar.
!ls ../
lec01 lec02 lec03 lec04 lec05 lec06 lec07 lec08
For example, suppose you have a separate `.csv` file for each day of the year. You can use `os.listdir` to collect the file names, read each file into a DataFrame, and `pd.concat` the DataFrames together, all from within `pandas`, as sketched below.
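A minimal sketch, assuming a hypothetical folder 'data/daily' that contains only daily .csv files with identical columns:

# Read each daily file into a DataFrame, then concatenate them all at once
daily_files = sorted(os.listdir('data/daily'))
daily_dfs = [pd.read_csv(os.path.join('data/daily', f)) for f in daily_files]
combined = pd.concat(daily_dfs, ignore_index=True)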
When working with time data, you will see two different kinds of "times": timestamps, which refer to particular moments in time, and time deltas, which refer to durations of time.
### The `datetime` module

Python has a built-in `datetime` module, which contains `datetime` and `timedelta` types. These are much more convenient to deal with than strings that contain times.
import datetime
datetime.datetime.now()
datetime.datetime(2022, 4, 13, 11, 3, 59, 289345)
datetime.datetime.now() + datetime.timedelta(days=3, hours=5)
datetime.datetime(2022, 4, 16, 16, 3, 59, 294369)
Recall, Unix timestamps count the number of seconds since January 1st, 1970.
datetime.datetime.now().timestamp()
1649873039.298737
### Timestamps in `pandas`

`pd.Timestamp` is the `pandas` equivalent of `datetime`. `pd.to_datetime` converts strings to `pd.Timestamp` objects.

pd.Timestamp(year=1998, month=11, day=26)
Timestamp('1998-11-26 00:00:00')
final_start = pd.to_datetime('June 4th, 2022, 11:30AM')
final_start
Timestamp('2022-06-04 11:30:00')
final_finish = pd.to_datetime('June 4th, 2022, 2:30PM')
final_finish
Timestamp('2022-06-04 14:30:00')
Timestamps have time-related attributes, e.g. `dayofweek`, `hour`, `minute`, and `second`.
final_finish.dayofweek
5
final_finish.year
2022
Subtracting timestamps yields `pd.Timedelta` objects.
final_finish - final_start
Timedelta('0 days 03:00:00')
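A `pd.Timedelta` can be converted to a plain number when needed; a sketch using `total_seconds`:

# Length of the exam in seconds: 3 hours = 10800.0 seconds
(final_finish - final_start).total_seconds()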
- When you use `pd.to_datetime` on a Series, `pandas` stores the values as yet another type: `np.datetime64`.
- `np.datetime64` works like `pd.Timestamp`, but is optimized for memory and speed efficiency.
- When you access an individual value, you get a `pd.Timestamp` back.

times = pd.DataFrame({'finish': pd.to_datetime(['Sun, Jan 01, 1989',
'2022-04-13T11:00',
'1/1/1970'])})
times
finish | |
---|---|
0 | 1989-01-01 00:00:00 |
1 | 2022-04-13 11:00:00 |
2 | 1970-01-01 00:00:00 |
times.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   finish  3 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 152.0 bytes
times.iloc[0, 0]
Timestamp('1989-01-01 00:00:00')
times.sort_values('finish')
finish | |
---|---|
2 | 1970-01-01 00:00:00 |
0 | 1989-01-01 00:00:00 |
1 | 2022-04-13 11:00:00 |
Below, we have the Final Exam starting and ending times for two sections of a course.
times_A = pd.DataFrame({
'Name': ['Annie', 'Billy', 'Sally', 'Tommy'],
'start_exam': ['15:00', '15:02', '15:01', '15:00'],
'finish_exam': ['16:00', '17:58', '17:05', '16:55']
})
times_B = pd.DataFrame({
'Name': ['Junior', 'Rex', 'Flash'],
'start_exam': ['18:00', '18:06', '19:07'],
'finish_exam': ['20:00', '20:50', '20:59']
})
display(times_A)
display(times_B)
Name | start_exam | finish_exam | |
---|---|---|---|
0 | Annie | 15:00 | 16:00 |
1 | Billy | 15:02 | 17:58 |
2 | Sally | 15:01 | 17:05 |
3 | Tommy | 15:00 | 16:55 |
Name | start_exam | finish_exam | |
---|---|---|---|
0 | Junior | 18:00 | 20:00 |
1 | Rex | 18:06 | 20:50 |
2 | Flash | 19:07 | 20:59 |
Question: Who finished the exam the fastest amongst all students in the course?
Approach:
1. Concatenate the two DataFrames.
2. Convert the exam times to `pd.Timestamp`.
3. Compute the difference between `'finish_exam'` and `'start_exam'`.
4. Sort by the resulting elapsed times.
5. Take the `'Name'` of the first row.
.# Step 1
both_versions = pd.concat([times_A, times_B])
both_versions
Name | start_exam | finish_exam | |
---|---|---|---|
0 | Annie | 15:00 | 16:00 |
1 | Billy | 15:02 | 17:58 |
2 | Sally | 15:01 | 17:05 |
3 | Tommy | 15:00 | 16:55 |
0 | Junior | 18:00 | 20:00 |
1 | Rex | 18:06 | 20:50 |
2 | Flash | 19:07 | 20:59 |
# Step 2
both_versions = both_versions.assign(
start_exam=pd.to_datetime(both_versions['start_exam']),
finish_exam=pd.to_datetime(both_versions['finish_exam'])
)
both_versions.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 0 to 2
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Name         7 non-null      object
 1   start_exam   7 non-null      datetime64[ns]
 2   finish_exam  7 non-null      datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 224.0+ bytes
# Step 3
both_versions = both_versions.assign(
elapsed=both_versions['finish_exam'] - both_versions['start_exam']
)
both_versions
Name | start_exam | finish_exam | elapsed | |
---|---|---|---|---|
0 | Annie | 2022-04-13 15:00:00 | 2022-04-13 16:00:00 | 0 days 01:00:00 |
1 | Billy | 2022-04-13 15:02:00 | 2022-04-13 17:58:00 | 0 days 02:56:00 |
2 | Sally | 2022-04-13 15:01:00 | 2022-04-13 17:05:00 | 0 days 02:04:00 |
3 | Tommy | 2022-04-13 15:00:00 | 2022-04-13 16:55:00 | 0 days 01:55:00 |
0 | Junior | 2022-04-13 18:00:00 | 2022-04-13 20:00:00 | 0 days 02:00:00 |
1 | Rex | 2022-04-13 18:06:00 | 2022-04-13 20:50:00 | 0 days 02:44:00 |
2 | Flash | 2022-04-13 19:07:00 | 2022-04-13 20:59:00 | 0 days 01:52:00 |
# Steps 4 and 5
both_versions.sort_values('elapsed').iloc[0].loc['Name']
'Annie'
To summarize:
- `pivot_table` aggregates data based on two categorical columns, and reshapes the result to be "wide" instead of "long".
- To combine multiple DataFrames, use `pd.concat` with a list of DataFrames.
- Times in `pandas` are stored using `pd.Timestamp` and `pd.Timedelta` objects.