# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Notebook magics: load the pandas_tutor extension used by the %%pt cells below.
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
# Render plots as SVG and use a consistent style.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Truncate long arrays/DataFrames so displayed output stays compact.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame
def merging_animation():
    """Display the Google Slides animation illustrating how .merge combines rows.

    Side effect only: embeds the slide deck as an IFrame in the notebook output.
    (The export had lost the function-body indentation; restored here.)
    """
    src="https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000"
    width=825
    height=500
    display(IFrame(src, width, height))
# Anonymized course roster: one row per student, with 'name' and lecture 'section'.
roster = bpd.read_csv('data/roster-anon.csv')
roster
name | section | |
---|---|---|
0 | Levy Dmxsqj | 11AM |
1 | Aiden Nyozzx | 1PM |
2 | Sruti Fivolq | 12PM |
... | ... | ... |
408 | Leni Hlfjhh | 11AM |
409 | Dory Xaghsk | 1PM |
410 | Laura Xfqwzu | 11AM |
411 rows × 2 columns
Recall, last class, we extracted the first name of each student in the class.
def first_name(full_name):
    """Return the first name (first whitespace-separated token) of a full name.

    Uses str.split() with no separator rather than split(' '): the result is
    identical for ordinary 'First Last' strings, but this is also robust to
    leading whitespace or multiple spaces between names.
    (The export had lost the function-body indentation; restored here.)
    """
    return full_name.split()[0]
# Add a 'first' column by applying first_name to every entry of 'name'.
with_first = roster.assign(
first=roster.get('name').apply(first_name)
)
with_first
name | section | first | |
---|---|---|---|
0 | Levy Dmxsqj | 11AM | Levy |
1 | Aiden Nyozzx | 1PM | Aiden |
2 | Sruti Fivolq | 12PM | Sruti |
... | ... | ... | ... |
408 | Leni Hlfjhh | 11AM | Leni |
409 | Dory Xaghsk | 1PM | Dory |
410 | Laura Xfqwzu | 11AM | Laura |
411 rows × 3 columns
How many 'Ethan's are in each section? We discovered that 'Ethan' is the most popular first name overall.
# Number of students sharing each first name, sorted most common first.
# After groupby('first').count(), every remaining column holds the group size,
# so keeping just ['name'] is enough.
first_counts = (with_first.groupby('first').count()
.sort_values('name', ascending=False)
.get(['name']))
first_counts
name | |
---|---|
first | |
Ethan | 5 |
Steven | 4 |
Jason | 4 |
... | ... |
Huanchang | 1 |
Housheng | 1 |
Zoya | 1 |
361 rows × 1 columns
To find the number of 'Ethan's in each lecture section, we can query for only the rows corresponding to 'Ethan's, and then group by 'section'.
# Keep only the rows whose first name is 'Ethan', then count them per section.
with_first[with_first.get('first') == 'Ethan'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
11AM | 1 | 1 |
1PM | 3 | 3 |
But what if we want to know the number of 'Emily'
s and 'Yuen'
s per section, too?
with_first[with_first.get('first') == 'Emily'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
1PM | 1 | 1 |
with_first[with_first.get('first') == 'Yuen'].groupby('section').count()
name | first | |
---|---|---|
section | ||
1PM | 2 | 2 |
Is there a way to do this for all first names and sections all at once?
For example, we'd like to know the number of 'Ethan's in the 1PM section, the number of 'Emily's in the 10AM section, and so on.

with_first
name | section | first | |
---|---|---|---|
0 | Levy Dmxsqj | 11AM | Levy |
1 | Aiden Nyozzx | 1PM | Aiden |
2 | Sruti Fivolq | 12PM | Sruti |
... | ... | ... | ... |
408 | Leni Hlfjhh | 11AM | Leni |
409 | Dory Xaghsk | 1PM | Dory |
410 | Laura Xfqwzu | 11AM | Laura |
411 rows × 3 columns
# Group by both columns at once: one row per (section, first name) pair,
# with a MultiIndex on the result.
with_first.groupby(['section', 'first']).count()
name | ||
---|---|---|
section | first | |
10AM | Adrian | 1 |
Ahmed | 1 | |
Akshay | 1 | |
... | ... | ... |
1PM | Zilu | 1 |
Ziwei | 1 | |
Ziyu | 1 |
393 rows × 1 columns
The above DataFrame is telling us, for instance, that there is 1 student with the first name 'Adrian'
in the 10AM section.
It is not saying that there is only 1 'Adrian'
in the course overall – in fact, there are 2!
with_first[with_first.get('first') == 'Adrian']
name | section | first | |
---|---|---|---|
37 | Adrian Yombcy | 10AM | Adrian |
332 | Adrian Rxppvf | 11AM | Adrian |
## .groupby with subgroups

To make subgroups, pass a list of column names to .groupby:
df.groupby([col_1, col_2, ..., col_k])
This groups by col_1 first; within each group, it groups by col_2, and so on.
Here, we group by 'section' and 'first'.
Use .reset_index() after grouping with subgroups to "flatten" our DataFrame back to normal.

with_first.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
with_first.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
with_first.groupby(['first', 'section']).count().reset_index()
first | section | name | |
---|---|---|---|
0 | Aaron | 11AM | 1 |
1 | Abdulrahim | 11AM | 1 |
2 | Abigail | 11AM | 1 |
... | ... | ... | ... |
390 | Ziyu | 1PM | 1 |
391 | Zoey | 11AM | 1 |
392 | Zoya | 11AM | 1 |
393 rows × 3 columns
Answer: Kind of. The order of the rows and columns will be different, but the content will be the same.
Using counts
, find the lecture section with the most 'Ryan'
s.
# Flattened version: .reset_index() moves 'section' and 'first' out of the
# MultiIndex and back into regular columns.
counts = with_first.groupby(['section', 'first']).count().reset_index()
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
...
Ellipsis
Using counts
, find the longest first name in the class that is shared by at least two students in the same section.
Note: This was an activity in the last class. There, we had to use our shared_first_and_section
function; that's not needed here!
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
...
Ellipsis
This dataset contains the sea surface temperature in La Jolla, on many days ranging from August 22, 1916 to December 31, 2020.
# Daily sea surface temperatures in La Jolla, 1916-08-22 through 2020-12-31,
# with separate YEAR / MONTH / DAY columns.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp
YEAR | MONTH | DAY | SURFACE_TEMP | |
---|---|---|---|---|
0 | 1916 | 8 | 22 | 19.5 |
1 | 1916 | 8 | 23 | 19.9 |
2 | 1916 | 8 | 24 | 19.7 |
... | ... | ... | ... | ... |
36839 | 2020 | 12 | 29 | 14.9 |
36840 | 2020 | 12 | 30 | 15.0 |
36841 | 2020 | 12 | 31 | 14.8 |
36842 rows × 4 columns
We want to find the single month (e.g. November 1998) with the highest average 'SURFACE_TEMP'
.
Which of the following would help us achieve this goal?
A. sea_temp.groupby('SURFACE_TEMP').mean()
B. sea_temp.groupby('MONTH').mean()
C. sea_temp.groupby(['YEAR', 'MONTH']).mean()
D. sea_temp.groupby(['MONTH', 'DAY']).mean()
E. sea_temp.groupby(['MONTH', 'SURFACE_TEMP']).mean()
...
Ellipsis
# Average surface temperature for each calendar month (1-12), pooled across
# all years — shows the seasonal pattern.
(sea_temp
.groupby('MONTH')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
# Average surface temperature for each year — one point per year.
(sea_temp
.groupby('YEAR')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
## Summary: .groupby with subgroups

Pass a list of column names to .groupby to make subgroups.
Use .reset_index() after grouping with subgroups to move the MultiIndex back to the columns.

phones = bpd.DataFrame().assign(
Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
Price=[799, 1099, 999, 449],
Screen=[6.1, 6.7, 6.7, 6.3]
)
inventory = bpd.DataFrame().assign(
Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
Units=[50, 40, 10, 100],
Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)
# Phones on the market right now
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Which phones my stores have in stock in the area
inventory
Handset | Units | Store | |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
Question: If I sell all of the phones in my inventory, how much will I make in revenue?
phones.merge(inventory, left_on='Model', right_on='Handset')
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Click through the presentation that appears
merging_animation()
## .merge

left_df.merge(
    right_df,
    left_on=left_column_name,
    right_on=right_column_name
)
left_on and right_on should be column names (they don't have to be the same).

%%pt
# Notice there's no Samsung Galaxy Z Flip in phones_merged
# (rows with no match on Model == Handset are dropped by the default inner merge).
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
# Total revenue if every unit in inventory sells: price times units, summed.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()
171300
Using on: when the join column has the same name in both DataFrames, we can pass on= instead of left_on and right_on.
# Copy 'Handset' into a column named 'Model' and drop the original, so both
# DataFrames share the join column's name and we can merge with on='Model'.
inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled
Units | Store | Model | |
---|---|---|---|
0 | 50 | Westfield UTC | iPhone 13 Pro Max |
1 | 40 | Westfield UTC | iPhone 13 |
2 | 10 | Fashion Valley | Pixel 5a |
3 | 100 | Downtown | iPhone 13 |
# With on= (same column name in both DataFrames), the join column appears
# only once in the result.
phones.merge(inventory_relabeled, on='Model')
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
Notice: There's only one column containing phone names now.
%%pt
inventory.merge(phones, left_on='Handset', right_on='Model')
Answer: The order of the rows and columns will be different, but the content will be the same.
Instead of using left_on
or right_on
, use left_index=True
or right_index=True
.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Move 'Handset' into the index so we can merge against the index of the
# right DataFrame instead of one of its columns.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset
Units | Store | |
---|---|---|
Handset | ||
iPhone 13 Pro Max | 50 | Westfield UTC |
iPhone 13 | 40 | Westfield UTC |
Pixel 5a | 10 | Fashion Valley |
iPhone 13 | 100 | Downtown |
# right_index=True matches phones' 'Model' column against
# inventory_by_handset's index.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
0 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
1 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
# Small example DataFrames for the merge row-count activities below.
# Note: today_high_temp values are strings here, not numbers.
nice_weather_cities = bpd.DataFrame().assign(
city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
state=['California', 'California', 'Texas', 'California'],
today_high_temp=['79', '83', '87', '87']
)
schools = bpd.DataFrame().assign(
name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'],
city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='city')
?
A. 4 B. 5 C. 6 D. 7 E. 8
nice_weather_cities
city | state | today_high_temp | |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
name | city | state | graduation_rate | |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='state')
?
Hint: It's more than you might guess at first!
%%pt
# Merging on 'state' pairs EVERY nice-weather city in a state with EVERY
# school in that state, so matches multiply (e.g. 3 CA cities x 4 CA schools).
nice_weather_cities.merge(schools, on='state')
nice_weather_cities.merge(schools, on='state').shape[0]
13
## Summary

To group by multiple columns, pass a list to .groupby.
To combine information from multiple DataFrames, use .merge.
When using .merge, Python searches for a match between a specified column in each DataFrame and combines the rows with a match.