# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Notebook magics: load the pandas_tutor extension used by the %%pt cells below.
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
# Render plots as SVG and use a consistent style.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Truncate long arrays/DataFrames so displayed output stays compact.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame
def merging_animation():
    """Display the Google Slides animation illustrating how .merge combines rows.

    Side effect only: embeds the slide deck as an IFrame in the notebook output.
    (The export had lost the function-body indentation; restored here.)
    """
    src="https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000"
    width=825
    height=500
    display(IFrame(src, width, height))
# Anonymized course roster: one row per student, with 'name' and lecture 'section'.
roster = bpd.read_csv('data/roster-anon.csv')
roster
name | section | |
---|---|---|
0 | Levy Dmxsqj | 11AM |
1 | Aiden Nyozzx | 1PM |
2 | Sruti Fivolq | 12PM |
... | ... | ... |
408 | Leni Hlfjhh | 11AM |
409 | Dory Xaghsk | 1PM |
410 | Laura Xfqwzu | 11AM |
411 rows × 2 columns
Recall, last class, we extracted the first name of each student in the class.
def first_name(full_name):
    """Return the first name (first whitespace-separated token) of a full name.

    Uses str.split() with no separator rather than split(' '): the result is
    identical for ordinary 'First Last' strings, but this is also robust to
    leading whitespace or multiple spaces between names.
    (The export had lost the function-body indentation; restored here.)
    """
    return full_name.split()[0]
# Add a 'first' column by applying first_name to every entry of 'name'.
with_first = roster.assign(
first=roster.get('name').apply(first_name)
)
with_first
name | section | first | |
---|---|---|---|
0 | Levy Dmxsqj | 11AM | Levy |
1 | Aiden Nyozzx | 1PM | Aiden |
2 | Sruti Fivolq | 12PM | Sruti |
... | ... | ... | ... |
408 | Leni Hlfjhh | 11AM | Leni |
409 | Dory Xaghsk | 1PM | Dory |
410 | Laura Xfqwzu | 11AM | Laura |
411 rows × 3 columns
How many 'Ethan's are in each section? We discovered that 'Ethan' is the most popular first name overall.
# Number of students sharing each first name, sorted most common first.
# After groupby('first').count(), every remaining column holds the group size,
# so keeping just ['name'] is enough.
first_counts = (with_first.groupby('first').count()
.sort_values('name', ascending=False)
.get(['name']))
first_counts
name | |
---|---|
first | |
Ethan | 5 |
Steven | 4 |
Jason | 4 |
... | ... |
Huanchang | 1 |
Housheng | 1 |
Zoya | 1 |
361 rows × 1 columns
To find the number of 'Ethan's in each lecture section, we can query for only the rows corresponding to 'Ethan's, and then group by 'section'.
# Keep only the rows whose first name is 'Ethan', then count them per section.
with_first[with_first.get('first') == 'Ethan'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
11AM | 1 | 1 |
1PM | 3 | 3 |
But what if we want to know the number of 'Emily'
s and 'Yuen'
s per section, too?
with_first[with_first.get('first') == 'Emily'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
1PM | 1 | 1 |
with_first[with_first.get('first') == 'Yuen'].groupby('section').count()
name | first | |
---|---|---|
section | ||
1PM | 2 | 2 |
Is there a way to do this for all first names and sections all at once?
For example, we'd like to know the number of 'Ethan's in the 1PM section, the number of 'Emily's in the 10AM section, and so on.

with_first
name | section | first | |
---|---|---|---|
0 | Levy Dmxsqj | 11AM | Levy |
1 | Aiden Nyozzx | 1PM | Aiden |
2 | Sruti Fivolq | 12PM | Sruti |
... | ... | ... | ... |
408 | Leni Hlfjhh | 11AM | Leni |
409 | Dory Xaghsk | 1PM | Dory |
410 | Laura Xfqwzu | 11AM | Laura |
411 rows × 3 columns
# Group by both columns at once: one row per (section, first name) pair,
# with a MultiIndex on the result.
with_first.groupby(['section', 'first']).count()
name | ||
---|---|---|
section | first | |
10AM | Adrian | 1 |
Ahmed | 1 | |
Akshay | 1 | |
... | ... | ... |
1PM | Zilu | 1 |
Ziwei | 1 | |
Ziyu | 1 |
393 rows × 1 columns
The above DataFrame is telling us, for instance, that there is 1 student with the first name 'Adrian'
in the 10AM section.
It is not saying that there is only 1 'Adrian'
in the course overall – in fact, there are 2!
with_first[with_first.get('first') == 'Adrian']
name | section | first | |
---|---|---|---|
37 | Adrian Yombcy | 10AM | Adrian |
332 | Adrian Rxppvf | 11AM | Adrian |
## .groupby with subgroups

To make subgroups, pass a list of column names to .groupby:
df.groupby([col_1, col_2, ..., col_k])
This groups by col_1 first; within each group, it groups by col_2, and so on.
Here, we group by 'section' and 'first'.
Use .reset_index() after grouping with subgroups to "flatten" our DataFrame back to normal.

with_first.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
with_first.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
with_first.groupby(['first', 'section']).count().reset_index()
first | section | name | |
---|---|---|---|
0 | Aaron | 11AM | 1 |
1 | Abdulrahim | 11AM | 1 |
2 | Abigail | 11AM | 1 |
... | ... | ... | ... |
390 | Ziyu | 1PM | 1 |
391 | Zoey | 11AM | 1 |
392 | Zoya | 11AM | 1 |
393 rows × 3 columns
Answer: Kind of. The order of the rows and columns will be different, but the content will be the same.
Using counts
, find the lecture section with the most 'Ryan'
s.
# Flattened version: .reset_index() moves 'section' and 'first' out of the
# MultiIndex and back into regular columns.
counts = with_first.groupby(['section', 'first']).count().reset_index()
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
...
Ellipsis
Using counts
, find the longest first name in the class that is shared by at least two students in the same section.
Note: This was an activity in the last class. There, we had to use our shared_first_and_section
function; that's not needed here!
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Adrian | 1 |
1 | 10AM | Ahmed | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
390 | 1PM | Zilu | 1 |
391 | 1PM | Ziwei | 1 |
392 | 1PM | Ziyu | 1 |
393 rows × 3 columns
...
Ellipsis
This dataset contains the sea surface temperature in La Jolla, on many days ranging from August 22, 1916 to December 31, 2020.
# Daily sea surface temperatures in La Jolla, 1916-08-22 through 2020-12-31,
# with separate YEAR / MONTH / DAY columns.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp
YEAR | MONTH | DAY | SURFACE_TEMP | |
---|---|---|---|---|
0 | 1916 | 8 | 22 | 19.5 |
1 | 1916 | 8 | 23 | 19.9 |
2 | 1916 | 8 | 24 | 19.7 |
... | ... | ... | ... | ... |
36839 | 2020 | 12 | 29 | 14.9 |
36840 | 2020 | 12 | 30 | 15.0 |
36841 | 2020 | 12 | 31 | 14.8 |
36842 rows × 4 columns
We want to find the single month (e.g. November 1998) with the highest average 'SURFACE_TEMP'
.
Which of the following would help us achieve this goal?
A. sea_temp.groupby('SURFACE_TEMP').mean()
B. sea_temp.groupby('MONTH').mean()
C. sea_temp.groupby(['YEAR', 'MONTH']).mean()
D. sea_temp.groupby(['MONTH', 'DAY']).mean()
E. sea_temp.groupby(['MONTH', 'SURFACE_TEMP']).mean()
...
Ellipsis
# Average surface temperature for each calendar month (1-12), pooled across
# all years — shows the seasonal pattern.
(sea_temp
.groupby('MONTH')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
# Average surface temperature for each year — one point per year.
(sea_temp
.groupby('YEAR')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
## Summary: .groupby with subgroups

Pass a list of column names to .groupby to make subgroups.
Use .reset_index() after grouping with subgroups to move the MultiIndex back to the columns.

phones = bpd.DataFrame().assign(
Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
Price=[799, 1099, 999, 449],
Screen=[6.1, 6.7, 6.7, 6.3]
)
inventory = bpd.DataFrame().assign(
Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
Units=[50, 40, 10, 100],
Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)
# Phones on the market right now
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Which phones my stores have in stock in the area
inventory
Handset | Units | Store | |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
Question: If I sell all of the phones in my inventory, how much will I make in revenue?
phones.merge(inventory, left_on='Model', right_on='Handset')
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Click through the presentation that appears
merging_animation()
## .merge

left_df.merge(
    right_df,
    left_on=left_column_name,
    right_on=right_column_name
)
left_on and right_on should be column names (they don't have to be the same).

%%pt
# Notice there's no Samsung Galaxy Z Flip in phones_merged
# (rows with no match on Model == Handset are dropped by the default inner merge).
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
# Total revenue if every unit in inventory sells: price times units, summed.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()
171300
Using on: when the join column has the same name in both DataFrames, we can pass on= instead of left_on and right_on.
# Copy 'Handset' into a column named 'Model' and drop the original, so both
# DataFrames share the join column's name and we can merge with on='Model'.
inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled
Units | Store | Model | |
---|---|---|---|
0 | 50 | Westfield UTC | iPhone 13 Pro Max |
1 | 40 | Westfield UTC | iPhone 13 |
2 | 10 | Fashion Valley | Pixel 5a |
3 | 100 | Downtown | iPhone 13 |
# With on= (same column name in both DataFrames), the join column appears
# only once in the result.
phones.merge(inventory_relabeled, on='Model')
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
Notice: There's only one column containing phone names now.
%%pt
inventory.merge(phones, left_on='Handset', right_on='Model')
Answer: The order of the rows and columns will be different, but the content will be the same.
Instead of using left_on
or right_on
, use left_index=True
or right_index=True
.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Move 'Handset' into the index so we can merge against the index of the
# right DataFrame instead of one of its columns.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset
Units | Store | |
---|---|---|
Handset | ||
iPhone 13 Pro Max | 50 | Westfield UTC |
iPhone 13 | 40 | Westfield UTC |
Pixel 5a | 10 | Fashion Valley |
iPhone 13 | 100 | Downtown |
# right_index=True matches phones' 'Model' column against
# inventory_by_handset's index.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
0 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
1 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
# Small example DataFrames for the merge row-count activities below.
# Note: today_high_temp values are strings here, not numbers.
nice_weather_cities = bpd.DataFrame().assign(
city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
state=['California', 'California', 'Texas', 'California'],
today_high_temp=['79', '83', '87', '87']
)
schools = bpd.DataFrame().assign(
name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'],
city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='city')
?
A. 4 B. 5 C. 6 D. 7 E. 8
nice_weather_cities
city | state | today_high_temp | |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
name | city | state | graduation_rate | |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='state')
?
Hint: It's more than you might guess at first!
%%pt
# Merging on 'state' pairs EVERY nice-weather city in a state with EVERY
# school in that state, so matches multiply (e.g. 3 CA cities x 4 CA schools).
nice_weather_cities.merge(schools, on='state')
nice_weather_cities.merge(schools, on='state').shape[0]
13
## Summary

To group by multiple columns, pass a list to .groupby.
To combine information from multiple DataFrames, use .merge.
When using .merge, Python searches for a match between a specified column in each DataFrame and combines the rows with a match.