# Run this cell to set up packages for lecture.
from lec09_imports import *


# Read the roster CSV into a DataFrame, then display it (a bare expression at
# the end of a notebook cell is shown as that cell's output).
# NOTE(review): `bpd` comes from the star import above — presumably babypandas; confirm.
roster = bpd.read_csv('data/roster-anon.csv')
roster


def first_name(full_name):
    '''Returns the first name given a full name.

    The first name is the first whitespace-delimited word of full_name.
    Splitting on any run of whitespace (rather than on a single space)
    means leading spaces, tabs, or repeated spaces between words do not
    produce an empty or malformed first name.

    Parameters:
        full_name: a string such as 'Allie Sazhma'.

    Returns:
        The first word of full_name, or '' if full_name contains no
        non-whitespace characters.
    '''
    words = full_name.split()
    # split() with no argument returns [] for empty or blank input,
    # so guard the index instead of raising IndexError.
    return words[0] if words else ''


# Add a 'first' column by applying first_name to every value in the 'name'
# column. The result is assigned back to `roster`, so every later cell sees
# the extra column.
roster = roster.assign(
    first=roster.get('name').apply(first_name)
)
roster


# How many students share each first name, most common names first.
# After .count() every remaining column holds the same group size, so a
# single column ('name') is kept as the count and used as the sort key.
name_counts = (
    roster
    .groupby('first')
    .count()
    .get(['name'])
    .sort_values(by='name', ascending=False)
)
name_counts


# For one first name at a time: keep only the rows whose 'first' equals that
# name (boolean-mask query), then count the remaining rows per section.
# Each name needs its own query, which is what grouping on two columns
# at once avoids.
roster[roster.get('first') == 'Kevin'].groupby('section').count()


roster[roster.get('first') == 'Rishabh'].groupby('section').count()


roster[roster.get('first') == 'Rafi'].groupby('section').count()


# One row per unique first name.
# The values are the counts of 'name' entries within each first-name group.
roster.groupby('first').count().get('name')

first
Aadya     1
Aanya     1
Aarav     1
         ..
Yuxun     1
Zach      1
Zixuan    1
Name: name, Length: 200, dtype: int64


# One row per unique section.
# The values are the counts of 'name' entries within each section group.
roster.groupby('section').count().get('name')

section
10AM    82
11AM    93
9AM     47
Name: name, dtype: int64


# Display the roster again before grouping on more than one column.
roster


# Passing a list of columns to groupby makes one group per unique
# (section, first) combination. The grouping columns become the index of the
# result; reset_index() moves them back into ordinary columns.
roster.groupby(['section', 'first']).count()


roster.groupby(['section', 'first']).count().reset_index()


roster.groupby(['section', 'first']).count().reset_index()


# Same groups with the columns listed in the opposite order: the displayed
# outputs have the same number of rows, but the column order and row order
# follow the order of the list.
roster.groupby(['first', 'section']).count().reset_index()


# Keep the (section, first) counts in a variable for later cells.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts

...

Ellipsis

...

Ellipsis


# Read the sea temperature CSV and display it. Per the displayed output, the
# columns are YEAR, MONTH, DAY and SURFACE_TEMP, with one row per date.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp

...

Ellipsis


# Mean of every column within each MONTH value (all years pooled), then a
# line plot of the mean SURFACE_TEMP. The trailing semicolon keeps the
# notebook from echoing the plot object's text representation.
(sea_temp
 .groupby('MONTH') 
 .mean() 
 .plot(kind='line', y='SURFACE_TEMP')
);


# Why is there a sudden drop at the end? Look at the dates of data collection!
# (The displayed rows run from 1916-08-22 to 2023-06-30, so the first and
# last years are only partly covered.)
(sea_temp
 .groupby('YEAR') 
 .mean() 
 .plot(kind='line', y='SURFACE_TEMP')
);


# Two small hand-built tables for the clothing resale example.
# offer_percentage: one row per clothing type, with the percentage of the
# retail price offered for that type.
offer_percentage = bpd.DataFrame().assign(
    clothing_type=['Shirt', 'Pants', 'Dress', 'Shorts', 'Shoes'],
    offer_percentage=[20, 30, 50, 30, 50]
)

# clothes: one row per item to sell. 'Shoes' appears twice and there is no
# 'Shorts' row, which matters once the two tables are merged.
clothes = bpd.DataFrame().assign(
    item=['Dress', 'Shirt', 'Shoes', 'Pants', 'Shoes'],
    retail_price=[150, 30, 90, 50, 70]
)


# The percentage of retail price that I can earn for reselling my clothes.
offer_percentage


# The items I want to sell and their retail prices.
clothes


# Pair each offer_percentage row with every clothes row whose 'item' equals
# its 'clothing_type'. In the displayed result 'Shorts' is absent (no
# matching item) and 'Shoes' appears twice (two matching items).
clothes_merged = offer_percentage.merge(clothes, left_on='clothing_type', right_on='item')
clothes_merged


# Click through the presentation that appears.
merging_animation()

# General shape of a merge call. NOTE(review): left_df, right_df and the two
# column names look like placeholders (they are not defined in the code shown
# here), so this reads as a syntax template rather than a runnable cell —
# confirm.
left_df.merge(
  right_df, 
  left_on='left_col_name',
  right_on='right_col_name'
)


# Rebuild the merged table so each item sits next to its offer percentage.
clothes_merged = offer_percentage.merge(clothes, left_on='clothing_type', right_on='item')
clothes_merged


# If I sell all of the clothes in my collection, how much will I earn?
# Per row: offer percentage converted to a fraction (/ 100) times the retail
# price; .sum() adds the per-item payouts into one total.
(clothes_merged.get('offer_percentage') / 100 * clothes_merged.get('retail_price')).sum()

176.0


# The same merge written both ways round. left_on always names a column of
# the DataFrame that .merge is called on; right_on names a column of the
# DataFrame passed as the argument.
offer_percentage.merge(clothes, left_on='clothing_type', right_on='item')


clothes.merge(offer_percentage, left_on='item', right_on='clothing_type')


offer_percentage


# Copy 'item' into a new 'clothing_type' column and drop 'item', so that both
# tables use the same name for the column to merge on.
clothes_relabeled = clothes.assign(clothing_type=clothes.get('item')).drop(columns=['item'])
clothes_relabeled


# With a shared column name, a single on= replaces the left_on/right_on pair.
offer_percentage.merge(clothes_relabeled, on='clothing_type')


# Move 'clothing_type' out of the columns and into the index.
offers_by_item = offer_percentage.set_index('clothing_type')
offers_by_item


clothes


# left_index=True matches on the left DataFrame's index instead of one of its
# columns; the right side is still matched on its 'item' column.
offers_by_item.merge(clothes, left_index=True, right_on='item')


# Cities for the merging activity, one row per city.
# NOTE(review): today_high_temp holds strings ('79'), not numbers — confirm
# this is intentional before comparing or doing arithmetic on the column.
nice_weather_cities = bpd.DataFrame().assign(
    city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
    state=['California', 'California', 'Texas', 'California'],
    today_high_temp=['79', '83', '87', '87']
    
)

# Schools for the merging activity, one row per school. graduation_rate
# values are all between 0 and 1.
schools = bpd.DataFrame().assign(
    name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'], 
    city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
    state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
    graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)


nice_weather_cities


schools


nice_weather_cities


schools


# Merging on 'state' pairs every city row with every school row that has the
# same state, so a state that occurs several times on both sides multiplies.
nice_weather_cities.merge(schools, on='state')


# Row count of that merge: 3 California cities x 4 California schools
# + 1 Texas city x 1 Texas school = 13. Illinois and Maryland have no
# matching city and contribute no rows.
nice_weather_cities.merge(schools, on='state').shape[0]

13

	name	section	first
0	Allie Sazhma	11AM	Allie
1	Amina Igxazd	10AM	Amina
2	Jazmine Enesxr	9AM	Jazmine
...	...	...	...
219	Ismayl Gwuiij	10AM	Ismayl
220	Neil Dkaqgm	10AM	Neil
221	Maggie Ldfgau	9AM	Maggie

	name	section	first
0	Allie Sazhma	11AM	Allie
1	Amina Igxazd	10AM	Amina
2	Jazmine Enesxr	9AM	Jazmine
...	...	...	...
219	Ismayl Gwuiij	10AM	Ismayl
220	Neil Dkaqgm	10AM	Neil
221	Maggie Ldfgau	9AM	Maggie

	YEAR	MONTH	DAY	SURFACE_TEMP
0	1916	8	22	19.5
1	1916	8	23	19.9
2	1916	8	24	19.7
...	...	...	...	...
37738	2023	6	28	19.7
37739	2023	6	29	19.3
37740	2023	6	30	20.6

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Spring 2024¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Kevin are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Clothing Resale 👕¶

If I sell all of the clothes in my collection, how much will I earn?¶

What just happened!? 🤯¶

`.merge`¶

If I sell all of the clothes in my collection, how much will I earn?¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

	section	first	name
0	10AM	Aadya	1
1	10AM	Abigail	1
2	10AM	Achintya	1
...	...	...	...
207	9AM	Yisi	1
208	9AM	Yujie	1
209	9AM	Zixuan	1

	first	section	name
0	Aadya	10AM	1
1	Aanya	11AM	1
2	Aarav	11AM	1
...	...	...	...
207	Yuxun	10AM	1
208	Zach	11AM	1
209	Zixuan	9AM	1

	clothing_type	offer_percentage	item	retail_price
0	Shirt	20	Shirt	30
1	Pants	30	Pants	50
2	Dress	50	Dress	150
3	Shoes	50	Shoes	90
4	Shoes	50	Shoes	70

	city	state	today_high_temp
0	La Jolla	California	79
1	San Diego	California	83
2	Austin	Texas	87
3	Los Angeles	California	87

	name	city	state	graduation_rate
0	UCSD	La Jolla	California	0.87
1	University of Chicago	Chicago	Illinois	0.94
2	University of San Diego	San Diego	California	0.78
3	Johns Hopkins University	Baltimore	Maryland	0.92
4	UT Austin	Austin	Texas	0.81
5	SDSU	San Diego	California	0.83
6	UCLA	Los Angeles	California	0.91

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Spring 2024¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Kevin are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Clothing Resale 👕¶

If I sell all of the clothes in my collection, how much will I earn?¶

What just happened!? 🤯¶

`.merge`¶

If I sell all of the clothes in my collection, how much will I earn?¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

`.merge`¶

Concept Check ✅ – Answer at cc.dsc10.com ¶