# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Notebook magics: load the pandas_tutor extension for step-by-step
# visualizations, sized for projection.
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
# Render plots as crisp SVG with the 'ggplot' theme.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Keep printed arrays/DataFrames short and rounded so output fits on slides.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame
def merging_animation():
    """Embed and display the Google Slides animation that illustrates
    how ``.merge`` combines two DataFrames row by row."""
    slides_url = (
        "https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000"
    )
    # 825x500 is a comfortable in-notebook size for the slide deck.
    display(IFrame(slides_url, 825, 500))
This question is best solved using if-statements, which we won't formally cover until the next lecture. Here is the solution for part of this question.
def cut_numerical(cut):
    """Convert a diamond's cut quality (a string) to a numerical rating.

    'Ideal' -> 5, 'Premium' -> 4, 'Very Good' -> 3, 'Good' -> 2, 'Fair' -> 1.
    Any other value maps to None, matching the original if-chain's
    fall-through behavior.
    """
    ratings = {
        'Ideal': 5,
        'Premium': 4,
        'Very Good': 3,
        'Good': 2,
        'Fair': 1,
    }
    return ratings.get(cut)
The other two functions you need to write will be very similar, with one if-condition for each possible value.
# Load the anonymized class roster: one row per student.
roster = bpd.read_csv('data/roster-anon.csv')
roster
name | section | |
---|---|---|
0 | Anya Iatypd | 10AM |
1 | Nathaniel Kcyrfu | 11AM |
2 | Jae Oadpmw | 10AM |
... | ... | ... |
347 | Danny Zsoyxb | 10AM |
348 | Alex Lrmwwt | 11AM |
349 | Giovanni Ibkdsu | 11AM |
350 rows × 2 columns
Recall, last class, we extracted the first name of each student in the class.
def first_name(full_name):
    """Return the first name given a full name (the text before the
    first space)."""
    before_space, _, _ = full_name.partition(' ')
    return before_space
# Add a 'first' column by applying first_name to every entry of 'name'.
roster = roster.assign(
first=roster.get('name').apply(first_name)
)
roster
name | section | first | |
---|---|---|---|
0 | Anya Iatypd | 10AM | Anya |
1 | Nathaniel Kcyrfu | 11AM | Nathaniel |
2 | Jae Oadpmw | 10AM | Jae |
... | ... | ... | ... |
347 | Danny Zsoyxb | 10AM | Danny |
348 | Alex Lrmwwt | 11AM | Alex |
349 | Giovanni Ibkdsu | 11AM | Giovanni |
350 rows × 3 columns
How many 'Ryan's are in each section? We discovered that 'Ryan' is the most popular first name overall.
# Count the students sharing each first name, most common names first.
first_counts = (roster.groupby('first').count()
.sort_values('name', ascending=False)
.get(['name']))
To find the number of 'Ryan's in each lecture section, we can query for only the rows corresponding to 'Ryan's, and then group by 'section'.
# Keep only the rows for 'Ryan', then count them per lecture section.
roster[roster.get('first') == 'Ryan'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 2 | 2 |
11AM | 1 | 1 |
9AM | 3 | 3 |
But what if we want to know the number of 'Joseph's and 'Janice's per section, too?
# Same pattern for 'Joseph': query for the name, then count per section.
roster[roster.get('first') == 'Joseph'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
# And for 'Janice': query for the name, then count per section.
roster[roster.get('first') == 'Janice'].groupby('section').count()
name | first | |
---|---|---|
section | ||
11AM | 1 | 1 |
9AM | 1 | 1 |
Is there a way to do this for all first names and sections all at once?
We'd like one table that tells us, for instance, the number of 'Ryan's in the 9AM section and the number of 'Janice's in the 11AM section.

roster
name | section | first | |
---|---|---|---|
0 | Anya Iatypd | 10AM | Anya |
1 | Nathaniel Kcyrfu | 11AM | Nathaniel |
2 | Jae Oadpmw | 10AM | Jae |
... | ... | ... | ... |
347 | Danny Zsoyxb | 10AM | Danny |
348 | Alex Lrmwwt | 11AM | Alex |
349 | Giovanni Ibkdsu | 11AM | Giovanni |
350 rows × 3 columns
# Group by two columns at once: one group per (section, first name) pair.
roster.groupby(['section', 'first']).count()
name | ||
---|---|---|
section | first | |
10AM | Aahil | 1 |
Aishwarya | 1 | |
Alejandro | 1 | |
... | ... | ... |
9AM | Zack | 1 |
Zeyuan | 1 | |
Zhanlin | 1 |
337 rows × 1 columns
The above DataFrame is telling us, for instance, that there is 1 student with the first name 'Zhanlin'
in the 9AM section.
It is not saying that there is only 1 'Zhanlin'
in the course overall. There could be more, just in other sections.
.groupby with subgroups

To group by multiple columns, pass a list of column names to .groupby:

df.groupby([col_1, col_2, ..., col_k])

Groups are formed by col_1 first, then by col_2 within each group of col_1, and so on. Here, we group by 'section' and 'first'. Use .reset_index() to "flatten" our DataFrame back to normal.

roster.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Aahil | 1 |
1 | 10AM | Aishwarya | 1 |
2 | 10AM | Alejandro | 1 |
... | ... | ... | ... |
334 | 9AM | Zack | 1 |
335 | 9AM | Zeyuan | 1 |
336 | 9AM | Zhanlin | 1 |
337 rows × 3 columns
# Group by (section, first), then flatten the MultiIndex back into columns.
roster.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Aahil | 1 |
1 | 10AM | Aishwarya | 1 |
2 | 10AM | Alejandro | 1 |
... | ... | ... | ... |
334 | 9AM | Zack | 1 |
335 | 9AM | Zeyuan | 1 |
336 | 9AM | Zhanlin | 1 |
337 rows × 3 columns
# Same grouping with the column order swapped: same content, rows/columns
# appear in a different order.
roster.groupby(['first', 'section']).count().reset_index()
first | section | name | |
---|---|---|---|
0 | Aahil | 10AM | 1 |
1 | Abhay | 9AM | 1 |
2 | Aditi | 11AM | 1 |
... | ... | ... | ... |
334 | Zixuan | 10AM | 1 |
335 | Ziyao | 10AM | 1 |
336 | Zoe | 11AM | 1 |
337 rows × 3 columns
Answer: Kind of. The order of the rows and columns will be different, but the content will be the same.
Using counts
, find the lecture section with the most 'Giovanni'
s.
# Flattened counts table: one row per (section, first name) combination.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Aahil | 1 |
1 | 10AM | Aishwarya | 1 |
2 | 10AM | Alejandro | 1 |
... | ... | ... | ... |
334 | 9AM | Zack | 1 |
335 | 9AM | Zeyuan | 1 |
336 | 9AM | Zhanlin | 1 |
337 rows × 3 columns
...
Ellipsis
Using counts
, find the shortest first name in the class that is shared by at least two students in the same section.
...
Ellipsis
This dataset contains the sea surface temperature in La Jolla, on many days ranging from August 22, 1916 to December 31, 2021.
# Daily La Jolla sea surface temperatures, Aug 22, 1916 - Dec 31, 2021.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp
YEAR | MONTH | DAY | SURFACE_TEMP | |
---|---|---|---|---|
0 | 1916 | 8 | 22 | 19.5 |
1 | 1916 | 8 | 23 | 19.9 |
2 | 1916 | 8 | 24 | 19.7 |
... | ... | ... | ... | ... |
37195 | 2021 | 12 | 29 | 15.0 |
37196 | 2021 | 12 | 30 | 15.1 |
37197 | 2021 | 12 | 31 | 15.4 |
37198 rows × 4 columns
We want to find the single month (e.g. November 1998) with the highest average 'SURFACE_TEMP'
.
Which of the following would help us achieve this goal?
A. sea_temp.groupby('SURFACE_TEMP').mean()
B. sea_temp.groupby('MONTH').mean()
C. sea_temp.groupby(['YEAR', 'MONTH']).mean()
D. sea_temp.groupby(['MONTH', 'DAY']).mean()
E. sea_temp.groupby(['MONTH', 'SURFACE_TEMP']).mean()
...
Ellipsis
# Average surface temperature per calendar month (all years pooled).
(sea_temp
.groupby('MONTH')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
# Average surface temperature per year.
(sea_temp
.groupby('YEAR')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
Summary: .groupby with subgroups

Pass a list of column names to .groupby to make subgroups. Use .reset_index() after grouping with subgroups to move the MultiIndex back to the columns.

phones = bpd.DataFrame().assign(
Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
Price=[799, 1099, 999, 449],
Screen=[6.1, 6.7, 6.7, 6.3]
)
# Inventory: how many units of each handset each store has in stock.
inventory = bpd.DataFrame().assign(
Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
Units=[50, 40, 10, 100],
Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)
# Phones on the market right now
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Which phones my stores have in stock in the area
inventory
Handset | Units | Store | |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
Question: If I sell all of the phones in my inventory, how much will I make in revenue?
# Match 'Model' in phones with 'Handset' in inventory; rows without a
# match on either side are dropped.
phones.merge(inventory, left_on='Model', right_on='Handset')
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Click through the presentation that appears
merging_animation()
The general syntax of .merge:

left_df.merge(
    right_df,
    left_on=left_column_name,
    right_on=right_column_name
)

left_on and right_on should be column names (they don't have to be the same).

%%pt
# Notice there's no Samsung Galaxy Z Flip in phones_merged
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
# Total revenue: each merged row pairs a phone's price with the units
# a particular store holds.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()
171300
If the columns to merge on share the same name, use on instead of left_on and right_on.

inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled
Units | Store | Model | |
---|---|---|---|
0 | 50 | Westfield UTC | iPhone 13 Pro Max |
1 | 40 | Westfield UTC | iPhone 13 |
2 | 10 | Fashion Valley | Pixel 5a |
3 | 100 | Downtown | iPhone 13 |
# With on='Model', the shared column appears only once in the result.
phones.merge(inventory_relabeled, on='Model')
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
Notice: There's only one column containing phone names now.
%%pt
# Merging in the other order: same content, different row/column order.
inventory.merge(phones, left_on='Handset', right_on='Model')
Answer: The order of the rows and columns will be different, but the content will be the same.
Instead of using left_on or right_on, use left_index=True or right_index=True.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Move 'Handset' into the index so we can merge on the index instead of a column.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset
Units | Store | |
---|---|---|
Handset | ||
iPhone 13 Pro Max | 50 | Westfield UTC |
iPhone 13 | 40 | Westfield UTC |
Pixel 5a | 10 | Fashion Valley |
iPhone 13 | 100 | Downtown |
# Match the left DataFrame's 'Model' column against the right DataFrame's index.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
0 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
1 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
# Cities with nice weather today, with their state and today's high temperature.
nice_weather_cities = bpd.DataFrame().assign(
city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
state=['California', 'California', 'Texas', 'California'],
today_high_temp=['79', '83', '87', '87']
)
# Universities, with their city, state, and graduation rate.
schools = bpd.DataFrame().assign(
name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'],
city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='city')
?
A. 4 B. 5 C. 6 D. 7 E. 8
nice_weather_cities
city | state | today_high_temp | |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
name | city | state | graduation_rate | |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='state')
?
%%pt
# Merging on 'state' pairs every matching combination:
# 3 California cities x 4 California schools + 1 Texas city x 1 Texas school
# = 13 rows.
nice_weather_cities.merge(schools, on='state')
nice_weather_cities.merge(schools, on='state').shape[0]
13
Summary:
- To make subgroups, pass a list of column names to .groupby.
- To combine information from multiple DataFrames, use .merge.
- When using .merge, Python searches for a match between a specified column in each DataFrame and combines the rows with a match.