# Run this cell to set up packages for lecture.
from lec09_imports import *


# Load the course roster from CSV (the file name suggests the names are
# anonymized). The bare name on the last line displays the table in a notebook.
roster = bpd.read_csv('data/roster-anon.csv')
roster


def first_name(full_name):
    '''Returns the first name given a full name.

    The first name is the first whitespace-delimited word of `full_name`.
    Leading whitespace and repeated whitespace are ignored, so
    ' Ryan  Smith' gives 'Ryan' rather than an empty string.

    Parameters:
        full_name: A string such as 'Ryan Smith'.

    Returns:
        The first word of `full_name`, or '' if it contains no words.
    '''
    # split() with no argument splits on any run of whitespace and drops
    # empty pieces; split(' ') would produce '' for a leading space.
    words = full_name.split()
    return words[0] if words else ''


# Add a 'first' column by applying first_name to every entry of 'name',
# then display the updated table.
roster = roster.assign(first=roster.get('name').apply(first_name))
roster


# How many students share each first name, most common names first.
# After groupby('first').count(), each remaining column holds the number of
# non-missing values per group, so 'name' serves as the count column:
# sort by it in descending order, then keep only that column.
name_counts = (
    roster
    .groupby('first')
    .count()
    .sort_values('name', ascending=False)
    .get(['name'])
)
name_counts


# One first name at a time: keep only the rows with that first name,
# then count those rows within each section.
roster[roster.get('first') == 'Ryan'].groupby('section').count()


# Same query for 'Matthew'.
roster[roster.get('first') == 'Matthew'].groupby('section').count()


# Same query for 'Alice'.
roster[roster.get('first') == 'Alice'].groupby('section').count()


# One row per unique first name.
roster.groupby('first').count().get('name')

first
Aadya     1
Aanya     1
Aaron     1
         ..
Zifan     1
Zilin     1
Ziling    1
Name: name, Length: 232, dtype: int64


# One row per unique section.
roster.groupby('section').count().get('name')

section
10AM    106
11AM     79
9AM      68
Name: name, dtype: int64


# Display the roster again before grouping on two columns.
roster


# Passing a list to groupby makes one group per (section, first) combination;
# count() then reports the number of rows in each combination.
roster.groupby(['section', 'first']).count()


# reset_index() moves the group labels out of the index and back into
# ordinary columns, leaving a default 0, 1, 2, ... index.
roster.groupby(['section', 'first']).count().reset_index()


roster.groupby(['section', 'first']).count().reset_index()


# Same grouping with the two grouping columns listed in the opposite order.
roster.groupby(['first', 'section']).count().reset_index()


# Keep the (section, first) counts table for later use.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts

...

Ellipsis

...

Ellipsis


# Load the sea temperature data; the plots below group it by its MONTH and
# YEAR columns and plot SURFACE_TEMP.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp

...

Ellipsis


# Line plot of the mean SURFACE_TEMP within each MONTH group.
# The trailing semicolon keeps the notebook from echoing the plot object.
sea_temp.groupby('MONTH').mean().plot(kind='line', y='SURFACE_TEMP');


# Line plot of the mean SURFACE_TEMP within each YEAR group.
# Why is there a sudden drop at the end? Look at the dates of data collection!
sea_temp.groupby('YEAR').mean().plot(kind='line', y='SURFACE_TEMP');


# One row per phone model, with its price and screen size.
# (Units are not stated in the data; presumably dollars and inches.)
phones = bpd.DataFrame().assign(
    Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
    Price=[799, 1099, 999, 449],
    Screen=[6.1, 6.7, 6.7, 6.3]
)

# One row per (handset, store) stock entry. The same handset can appear in
# several rows ('iPhone 13' is listed at two stores), and the phone name is
# stored under 'Handset' here rather than 'Model'.
inventory = bpd.DataFrame().assign(
    Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
    Units=[50, 40, 10, 100],
    Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)


# Phones on the market right now.
phones


# Phones that my stores in the area currently have in stock.
inventory


# Combine the two tables by matching each phone's 'Model' against the
# inventory's 'Handset'; the key columns have different names, so both
# left_on and right_on are given.
phones.merge(inventory, left_on='Model', right_on='Handset')


# Click through the presentation that appears.
merging_animation()

left_df.merge(
  right_df, 
  left_on='left_col_name',
  right_on='right_col_name'
)


phones


inventory


# Notice there's no Samsung Galaxy Z Flip in phones_merged!
# It has no matching 'Handset' row in inventory, so the merge drops it.
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
phones_merged


# Total revenue if every unit in stock is sold: multiply Price by Units
# row by row, then add up the products.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()

171300


# phones as the left table, inventory as the right table.
phones.merge(inventory, left_on='Model', right_on='Handset')


# The same merge with the roles swapped: inventory on the left, phones on the
# right. left_on/right_on are swapped to match.
inventory.merge(phones, left_on='Handset', right_on='Model')


phones


# Rename 'Handset' to 'Model' by copying the column under the new name and
# then dropping the old one.
inventory_relabeled = (
    inventory
    .assign(Model=inventory.get('Handset'))
    .drop(columns=['Handset'])
)
inventory_relabeled


# Both tables now use the same key column name, so a single `on` is enough.
phones.merge(inventory_relabeled, on='Model')


phones


# Move 'Handset' from a regular column into the index.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset


# right_index=True matches phones' 'Model' column against the index of the
# right table instead of one of its columns.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)


# Cities with nice weather today, one row per city.
# today_high_temp holds integers rather than strings like '79', so the column
# can be compared, sorted, and averaged numerically.
nice_weather_cities = bpd.DataFrame().assign(
    city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
    state=['California', 'California', 'Texas', 'California'],
    today_high_temp=[79, 83, 87, 87]
)

# Schools, one row each, with their city, state, and graduation rate.
schools = bpd.DataFrame().assign(
    name=[
        'UCSD',
        'University of Chicago',
        'University of San Diego',
        'Johns Hopkins University',
        'UT Austin',
        'SDSU',
        'UCLA',
    ],
    city=[
        'La Jolla',
        'Chicago',
        'San Diego',
        'Baltimore',
        'Austin',
        'San Diego',
        'Los Angeles',
    ],
    state=[
        'California',
        'Illinois',
        'California',
        'Maryland',
        'Texas',
        'California',
        'California',
    ],
    graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91],
)


nice_weather_cities


schools


nice_weather_cities


schools


# Merging on 'state' pairs every city with every school in the same state,
# not just schools in the same city: 3 California cities x 4 California
# schools, plus 1 Texas city x 1 Texas school.
nice_weather_cities.merge(schools, on='state')


# Number of rows in the merged table (3 * 4 + 1 * 1 = 13).
nice_weather_cities.merge(schools, on='state').shape[0]

13

	name	section	first
0	Jolette Obtwuz	9AM	Jolette
1	Ian Lmuqpm	11AM	Ian
2	Nicole Wpedyy	10AM	Nicole
...	...	...	...
250	Genevieve Cibjer	9AM	Genevieve
251	Devon Gncdxq	11AM	Devon
252	Allyson Hknnwt	9AM	Allyson

	name	section	first
0	Jolette Obtwuz	9AM	Jolette
1	Ian Lmuqpm	11AM	Ian
2	Nicole Wpedyy	10AM	Nicole
...	...	...	...
250	Genevieve Cibjer	9AM	Genevieve
251	Devon Gncdxq	11AM	Devon
252	Allyson Hknnwt	9AM	Allyson

	YEAR	MONTH	DAY	SURFACE_TEMP
0	1916	8	22	19.5
1	1916	8	23	19.9
2	1916	8	24	19.7
...	...	...	...	...
37738	2023	6	28	19.7
37739	2023	6	29	19.3
37740	2023	6	30	20.6

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Winter 2024¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Ryan are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Phone sales 📱¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

What just happened!? 🤯¶

`.merge`¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

	section	first	name
0	10AM	Aadya	1
1	10AM	Aanya	1
2	10AM	Alice	1
...	...	...	...
242	9AM	Zhencheng	1
243	9AM	Zilin	1
244	9AM	Ziling	1

	Model	Price	Screen
0	iPhone 13	799	6.1
1	iPhone 13 Pro Max	1099	6.7
2	Samsung Galaxy Z Flip	999	6.7
3	Pixel 5a	449	6.3

	Handset	Units	Store
0	iPhone 13 Pro Max	50	Westfield UTC
1	iPhone 13	40	Westfield UTC
2	Pixel 5a	10	Fashion Valley
3	iPhone 13	100	Downtown

	Units	Store
Handset
iPhone 13 Pro Max	50	Westfield UTC
iPhone 13	40	Westfield UTC
Pixel 5a	10	Fashion Valley
iPhone 13	100	Downtown

	city	state	today_high_temp
0	La Jolla	California	79
1	San Diego	California	83
2	Austin	Texas	87
3	Los Angeles	California	87

	name	city	state	graduation_rate
0	UCSD	La Jolla	California	0.87
1	University of Chicago	Chicago	Illinois	0.94
2	University of San Diego	San Diego	California	0.78
3	Johns Hopkins University	Baltimore	Maryland	0.92
4	UT Austin	Austin	Texas	0.81
5	SDSU	San Diego	California	0.83
6	UCLA	Los Angeles	California	0.91

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Winter 2024¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Ryan are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Phone sales 📱¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

What just happened!? 🤯¶

.merge¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

`.merge`¶

Concept Check ✅ – Answer at cc.dsc10.com ¶