# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from IPython.display import display, IFrame

def merging_animation():
    """Show the embedded Google Slides merging presentation in the notebook.

    Builds an 825x500 IFrame pointing at the presentation's embed URL and
    hands it to IPython's display().
    """
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000&rm=minimal"
    frame = IFrame(slides_url, width=825, height=500)
    display(frame)


roster = bpd.read_csv('data/roster-anon.csv')
roster


def first_name(full_name):
    """Return the part of full_name that comes before its first space.

    If full_name contains no space, the whole string is returned unchanged.
    """
    head, _, _ = full_name.partition(' ')
    return head


# Add a 'first' column holding each student's first name, computed by
# applying first_name to every value in the 'name' column.
roster = roster.assign(
    first=roster.get('name').apply(first_name)
)
roster


# Number of students with each first name, most common names first.
# After grouping and counting, the 'name' column holds the count for each
# first name, so it is the only column kept.
name_counts = (
    roster
    .groupby('first')
    .count()
    .sort_values('name', ascending=False)
    .get(['name'])
)
name_counts


# Keep only the students whose first name is Kevin, then count them per section.
roster[roster.get('first') == 'Kevin'].groupby('section').count()


# Same query for Victoria.
roster[roster.get('first') == 'Victoria'].groupby('section').count()


# Same query for Ethan.
roster[roster.get('first') == 'Ethan'].groupby('section').count()


# One row per unique first name.
roster.groupby('first').count().get('name')

first
Aadit      1
Aaron      1
Aarshia    1
          ..
Zhijie     1
Ziyan      1
Zubin      1
Name: name, Length: 438, dtype: int64


# One row per unique section.
roster.groupby('section').count().get('name')

section
10AM    150
1PM     150
8AM      75
9AM     150
Name: name, dtype: int64


roster


# Grouping on two columns gives one row per (section, first) combination;
# both grouping columns end up in the index of the result.
roster.groupby(['section', 'first']).count()


# .reset_index() moves the grouping columns out of the index and back into
# regular columns.
roster.groupby(['section', 'first']).count().reset_index()


roster.groupby(['section', 'first']).count().reset_index()


# Same grouping with the column order swapped: 'first' before 'section'.
roster.groupby(['first', 'section']).count().reset_index()


# Keep the per-(section, first) counts under a name.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts

...

Ellipsis

...

Ellipsis


sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp

...

Ellipsis


# Line plot of the mean SURFACE_TEMP for each MONTH value.
(sea_temp
 .groupby('MONTH') 
 .mean() 
 .plot(kind='line', y='SURFACE_TEMP')
);


# Line plot of the mean SURFACE_TEMP for each YEAR value.
# Why is there a sudden drop at the end? Look at the dates of data collection!
(sea_temp
 .groupby('YEAR') 
 .mean() 
 .plot(kind='line', y='SURFACE_TEMP')
);


# Example table of phone models: one row per model, with its price and
# screen size.
phones = bpd.DataFrame().assign(
    Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
    Price=[799, 1099, 999, 449],
    Screen=[6.1, 6.7, 6.7, 6.3]
)

# Example stock table: one row per (handset, store) pair with the number of
# units. Note that 'iPhone 13' appears twice (two different stores) and that
# 'Samsung Galaxy Z Flip' does not appear at all.
inventory = bpd.DataFrame().assign(
    Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
    Units=[50, 40, 10, 100],
    Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)


# Phones on the market right now.
phones


# Phones that my stores in the area have in stock.
inventory


phones.merge(inventory, left_on='Model', right_on='Handset')


# Click through the presentation that appears.
merging_animation()

# General shape of a merge call. left_df, right_df, and the column names are
# placeholders: they are not defined in the visible part of this file, so
# this snippet is illustrative and is not expected to run as written.
left_df.merge(
  right_df, 
  left_on='left_col_name',
  right_on='right_col_name'
)


phones


inventory


# Notice there's no Samsung Galaxy Z Flip in phones_merged!
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
phones_merged


# Total revenue if every unit in inventory sells: Price times Units for each
# merged row, summed over all rows.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()

171300


phones.merge(inventory, left_on='Model', right_on='Handset')


inventory.merge(phones, left_on='Handset', right_on='Model')


phones


# Give inventory a 'Model' column (a copy of 'Handset') and drop 'Handset',
# so that both DataFrames use the same name for the key column.
inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled


# With a shared column name, a single on= replaces the left_on/right_on pair.
phones.merge(inventory_relabeled, on='Model')


phones


# Move the 'Handset' column into the index of the inventory table.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset


# Match the 'Model' column of phones against the index of
# inventory_by_handset (right_index=True) instead of against a column.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)


# Example table of cities: one row per city with its state and today's high
# temperature.
nice_weather_cities = bpd.DataFrame().assign(
    city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
    state=['California', 'California', 'Texas', 'California'],
    # NOTE(review): the temperatures are stored as strings, not numbers --
    # confirm this is intentional before doing arithmetic on this column.
    today_high_temp=['79', '83', '87', '87']
    
)

# Example table of schools: one row per school with its city, state, and
# graduation rate. Several schools share a state, and 'San Diego' appears
# as the city of two different schools.
schools = bpd.DataFrame().assign(
    name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'], 
    city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
    state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
    graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)


nice_weather_cities


schools


nice_weather_cities


schools


# Merge on 'state' only: every city row is paired with every school row
# that has the same state.
nice_weather_cities.merge(schools, on='state')


# Number of rows in the merged result:
# 3 California cities x 4 California schools + 1 Texas city x 1 Texas school = 13.
nice_weather_cities.merge(schools, on='state').shape[0]

13

	YEAR	MONTH	DAY	SURFACE_TEMP
0	1916	8	22	19.5
1	1916	8	23	19.9
2	1916	8	24	19.7
...	...	...	...	...
37648	2023	3	29	14.4
37649	2023	3	30	14.8
37650	2023	3	31	15.1

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Kevin are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Phone sales 📱¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

What just happened!? 🤯¶

`.merge`¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

	name	section
0	Kavya Fquroe	10AM
1	Victoria Yppmzx	10AM
2	An-Chi Tmbqlr	8AM
...	...	...
522	Mehri Osrvjq	9AM
523	Noah Byphhr	9AM
524	Emily Hchqii	9AM

	name	section	first
0	Kavya Fquroe	10AM	Kavya
1	Victoria Yppmzx	10AM	Victoria
2	An-Chi Tmbqlr	8AM	An-Chi
...	...	...	...
522	Mehri Osrvjq	9AM	Mehri
523	Noah Byphhr	9AM	Noah
524	Emily Hchqii	9AM	Emily

		name
section	first
10AM	Aadit	1
	Aiden	1
	Akshay	1
...	...	...
9AM	Yun	1
	Zhaocheng	1
	Ziyan	1

	first	section	name
0	Aadit	10AM	1
1	Aaron	8AM	1
2	Aarshia	1PM	1
...	...	...	...
494	Zhijie	1PM	1
495	Ziyan	9AM	1
496	Zubin	10AM	1

	Model	Price	Screen
0	iPhone 13	799	6.1
1	iPhone 13 Pro Max	1099	6.7
2	Samsung Galaxy Z Flip	999	6.7
3	Pixel 5a	449	6.3

	Handset	Units	Store
0	iPhone 13 Pro Max	50	Westfield UTC
1	iPhone 13	40	Westfield UTC
2	Pixel 5a	10	Fashion Valley
3	iPhone 13	100	Downtown

	Units	Store
Handset
iPhone 13 Pro Max	50	Westfield UTC
iPhone 13	40	Westfield UTC
Pixel 5a	10	Fashion Valley
iPhone 13	100	Downtown

	city	state	today_high_temp
0	La Jolla	California	79
1	San Diego	California	83
2	Austin	Texas	87
3	Los Angeles	California	87

	name	city	state	graduation_rate
0	UCSD	La Jolla	California	0.87
1	University of Chicago	Chicago	Illinois	0.94
2	University of San Diego	San Diego	California	0.78
3	Johns Hopkins University	Baltimore	Maryland	0.92
4	UT Austin	Austin	Texas	0.81
5	SDSU	San Diego	California	0.83
6	UCLA	Los Angeles	California	0.91

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Kevin are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Phone sales 📱¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

What just happened!? 🤯¶

.merge¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

`.merge`¶

Concept Check ✅ – Answer at cc.dsc10.com ¶