# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
# Render inline matplotlib figures as SVG.
set_matplotlib_formats("svg")
# Use matplotlib's built-in 'ggplot' style sheet for the plots that follow.
plt.style.use('ggplot')

# NumPy printing: summarize arrays with more than 20 elements, show 2 digits
# after the decimal point, and print small floats in fixed-point notation
# instead of scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# pandas display: truncate DataFrames beyond 7 rows or 8 columns, and show
# floats with 2 digits after the decimal point.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from IPython.display import display, IFrame

def merging_animation():
    '''Displays the embedded slide presentation that walks through a merge.'''
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000&rm=minimal"
    # Frame size in pixels: 825 wide by 500 tall.
    frame = IFrame(slides_url, 825, 500)
    display(frame)


roster = bpd.read_csv('data/roster-anon.csv')
roster


def first_name(full_name):
    '''Returns the first name given a full name.

    The first name is the first whitespace-delimited word of `full_name`.
    Leading whitespace and repeated separators are ignored, so
    ' Ryan  Smith' gives 'Ryan' rather than ''. A blank or empty string
    gives ''.
    '''
    # split() with no argument splits on any run of whitespace and drops
    # empty pieces, unlike split(' '), which yields '' for a leading space.
    words = full_name.split()
    return words[0] if words else ''


# Add a 'first' column by applying first_name to every entry of 'name'.
roster = roster.assign(first=roster.get('name').apply(first_name))
roster


# Number of roster rows for each first name, most common names first.
# After count(), the 'name' column holds the size of each group.
name_counts = (
    roster.groupby('first').count()
    .get(['name'])
    .sort_values('name', ascending=False)
)
name_counts


# Keep only the rows whose first name is 'Ryan', then count them per section.
roster[roster.get('first') == 'Ryan'].groupby('section').count()


# Same query for the first name 'Marie'.
roster[roster.get('first') == 'Marie'].groupby('section').count()


# Same query for the first name 'Ethan'.
roster[roster.get('first') == 'Ethan'].groupby('section').count()


# One row per unique first name.
roster.groupby('first').count().get('name')

first
Aaron       2
Aarush      1
Abhishek    1
           ..
Zihan       1
Zike        1
Zixuan      1
Name: name, Length: 250, dtype: int64


# One row per unique section.
roster.groupby('section').count().get('name')

section
12PM    139
1PM     137
Name: name, dtype: int64


roster


# Passing a list to groupby makes one group per unique ('section', 'first')
# pair; both grouping columns end up in the index of the result.
roster.groupby(['section', 'first']).count()


# reset_index moves the group labels out of the index and back into columns.
roster.groupby(['section', 'first']).count().reset_index()


roster.groupby(['section', 'first']).count().reset_index()


# The same grouping with the columns listed in the opposite order.
roster.groupby(['first', 'section']).count().reset_index()


# Save the per-section, per-first-name counts under a name.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts

...

Ellipsis

...

Ellipsis


sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp

...

Ellipsis


# Mean of each column within each MONTH, with SURFACE_TEMP drawn as a line.
# The trailing semicolon stops the notebook from echoing the plot object.
sea_temp.groupby('MONTH').mean().plot(kind='line', y='SURFACE_TEMP');


# The same plot, grouping by YEAR instead.
sea_temp.groupby('YEAR').mean().plot(kind='line', y='SURFACE_TEMP');


# One row per phone model: its price and its screen size.
phones = bpd.DataFrame().assign(
    Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
    Price=[799, 1099, 999, 449],
    Screen=[6.1, 6.7, 6.7, 6.3]
)

# One row per (handset, store) stock entry. The same handset can appear in
# more than one row ('iPhone 13' is listed for two stores), and not every
# model in `phones` appears here.
inventory = bpd.DataFrame().assign(
    Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
    Units=[50, 40, 10, 100],
    Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)


# Phones on the market right now.
phones


# Which phones my stores have in stock in the area.
inventory


phones.merge(inventory, left_on='Model', right_on='Handset')


# Click through the presentation that appears.
merging_animation()

left_df.merge(
  right_df, 
  left_on='left_col_name',
  right_on='right_col_name'
)


phones


inventory


# Notice there's no Samsung Galaxy Z Flip in phones_merged!
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
phones_merged


# Revenue if everything sells: price times units in each merged row, summed.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()

171300


%%pt

phones.merge(inventory, left_on='Model', right_on='Handset')


%%pt

phones.merge(inventory, left_on='Model', right_on='Handset')


%%pt

inventory.merge(phones, left_on='Handset', right_on='Model')


phones


# Copy 'Handset' into a new 'Model' column and drop the original, so the
# merge column has the same name in both DataFrames.
inventory_relabeled = (
    inventory
    .assign(Model=inventory.get('Handset'))
    .drop(columns=['Handset'])
)
inventory_relabeled


phones.merge(inventory_relabeled, on='Model')


phones


inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset


phones.merge(inventory_by_handset, left_on='Model', right_index=True)


# One row per city, with its state and today's high temperature.
# Temperatures are stored as integers (not strings such as '79') so they can
# be compared, sorted and averaged numerically.
nice_weather_cities = bpd.DataFrame().assign(
    city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
    state=['California', 'California', 'Texas', 'California'],
    today_high_temp=[79, 83, 87, 87]
)

# One row per school: its name, city, state and graduation rate.
schools = bpd.DataFrame().assign(
    name=[
        'UCSD',
        'University of Chicago',
        'University of San Diego',
        'Johns Hopkins University',
        'UT Austin',
        'SDSU',
        'UCLA',
    ],
    city=[
        'La Jolla',
        'Chicago',
        'San Diego',
        'Baltimore',
        'Austin',
        'San Diego',
        'Los Angeles',
    ],
    state=[
        'California',
        'Illinois',
        'California',
        'Maryland',
        'Texas',
        'California',
        'California',
    ],
    graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91],
)


nice_weather_cities


schools


nice_weather_cities


schools


nice_weather_cities.merge(schools, on='state')


nice_weather_cities.merge(schools, on='state').shape[0]

13

	name	section	first
0	Derrick Gernlq	1PM	Derrick
1	Tommy Vbpsht	12PM	Tommy
2	Grace Smgsmb	12PM	Grace
...	...	...	...
273	Norah Pcqynf	12PM	Norah
274	Harry Jwofgg	1PM	Harry
275	Zhe Ltynpn	1PM	Zhe

	name	section	first
0	Derrick Gernlq	1PM	Derrick
1	Tommy Vbpsht	12PM	Tommy
2	Grace Smgsmb	12PM	Grace
...	...	...	...
273	Norah Pcqynf	12PM	Norah
274	Harry Jwofgg	1PM	Harry
275	Zhe Ltynpn	1PM	Zhe

	YEAR	MONTH	DAY	SURFACE_TEMP
0	1916	8	22	19.5
1	1916	8	23	19.9
2	1916	8	24	19.7
...	...	...	...	...
37195	2021	12	29	15.0
37196	2021	12	30	15.1
37197	2021	12	31	15.4

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Spring 2023¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Ryan are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Phone sales 📱¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

What just happened!? 🤯¶

`.merge`¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

Aside: Pandas Tutor¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

	section	first	name
0	12PM	Aaron	2
1	12PM	Abhishek	1
2	12PM	Adriana	1
...	...	...	...
256	1PM	Zhe	1
257	1PM	Zifei	1
258	1PM	Zixuan	1

	first	section	name
0	Aaron	12PM	2
1	Aarush	1PM	1
2	Abhishek	12PM	1
...	...	...	...
256	Zihan	12PM	1
257	Zike	12PM	1
258	Zixuan	1PM	1

	Model	Price	Screen
0	iPhone 13	799	6.1
1	iPhone 13 Pro Max	1099	6.7
2	Samsung Galaxy Z Flip	999	6.7
3	Pixel 5a	449	6.3

	Handset	Units	Store
0	iPhone 13 Pro Max	50	Westfield UTC
1	iPhone 13	40	Westfield UTC
2	Pixel 5a	10	Fashion Valley
3	iPhone 13	100	Downtown

	Units	Store
Handset
iPhone 13 Pro Max	50	Westfield UTC
iPhone 13	40	Westfield UTC
Pixel 5a	10	Fashion Valley
iPhone 13	100	Downtown

	city	state	today_high_temp
0	La Jolla	California	79
1	San Diego	California	83
2	Austin	Texas	87
3	Los Angeles	California	87

	name	city	state	graduation_rate
0	UCSD	La Jolla	California	0.87
1	University of Chicago	Chicago	Illinois	0.94
2	University of San Diego	San Diego	California	0.78
3	Johns Hopkins University	Baltimore	Maryland	0.92
4	UT Austin	Austin	Texas	0.81
5	SDSU	San Diego	California	0.83
6	UCLA	Los Angeles	California	0.91

Lecture 9 – Grouping on Multiple Columns, Merging¶

DSC 10, Spring 2023¶

Announcements¶

Agenda¶

Grouping on multiple columns¶

DSC 10 student data¶

How many students named Ryan are in each section?¶

How many students with each first name does each lecture section have?¶

Grouping on multiple columns¶

Grouping on multiple columns¶

Notice the index... 🤔¶

Does order matter?¶

Activity¶

Activity¶

Example: Sea temperatures 🌊¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Plots of monthly and yearly average surface temperature 📈¶

Summary: Grouping on multiple columns¶

Merging 🚙¶

Example: Phone sales 📱¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

What just happened!? 🤯¶

.merge¶

If I sell all of the phones in my inventory, how much will I make in revenue?¶

Aside: Pandas Tutor¶

Does it matter which DataFrame is the left or right DataFrame? 🤔¶

Special cases¶

What if the names of the columns we want to merge on are both the same?¶

What if we want to merge using an index instead of a column?¶

Activity setup¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Followup activity¶

More practice!¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com¶

`.merge`¶

Concept Check ✅ – Answer at cc.dsc10.com¶