# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame
def merging_animation():
    src = "https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000&rm=minimal"
    width = 825
    height = 500
    display(IFrame(src, width, height))
roster = bpd.read_csv('data/roster-anon.csv')
roster
| | name | section |
---|---|---|
0 | Derrick Gernlq | 1PM |
1 | Tommy Vbpsht | 12PM |
2 | Grace Smgsmb | 12PM |
... | ... | ... |
273 | Norah Pcqynf | 12PM |
274 | Harry Jwofgg | 1PM |
275 | Zhe Ltynpn | 1PM |
276 rows × 2 columns
Recall, last class, we extracted the first name of each student in the class.
def first_name(full_name):
    '''Returns the first name given a full name.'''
    return full_name.split(' ')[0]
roster = roster.assign(
first=roster.get('name').apply(first_name)
)
roster
| | name | section | first |
---|---|---|---|
0 | Derrick Gernlq | 1PM | Derrick |
1 | Tommy Vbpsht | 12PM | Tommy |
2 | Grace Smgsmb | 12PM | Grace |
... | ... | ... | ... |
273 | Norah Pcqynf | 12PM | Norah |
274 | Harry Jwofgg | 1PM | Harry |
275 | Zhe Ltynpn | 1PM | Zhe |
276 rows × 3 columns
We discovered that Ryan is the most popular first name overall.
name_counts = (
roster
.groupby('first')
.count()
.sort_values('name', ascending=False)
.get(['name'])
)
name_counts
| | name |
---|---|
first | |
Ryan | 6 |
Andrew | 4 |
Grace | 3 |
... | ... |
Jared | 1 |
Jasnoor | 1 |
Zixuan | 1 |
250 rows × 1 columns
To find the number of 'Ryan's in each lecture section, we can query for only the rows corresponding to 'Ryan's, and then group by 'section'.
roster[roster.get('first') == 'Ryan'].groupby('section').count()
| | name | first |
---|---|---|
section | ||
12PM | 2 | 2 |
1PM | 4 | 4 |
But what if we want to know the number of 'Marie's and 'Ethan's per section, too?
roster[roster.get('first') == 'Marie'].groupby('section').count()
| | name | first |
---|---|---|
section | ||
12PM | 1 | 1 |
1PM | 1 | 1 |
roster[roster.get('first') == 'Ethan'].groupby('section').count()
| | name | first |
---|---|---|
section | ||
12PM | 2 | 2 |
1PM | 1 | 1 |
Is there a way to do this for all first names and sections all at once?
We can group roster by 'first'.
# One row per unique first name.
roster.groupby('first').count().get('name')
first
Aaron       2
Aarush      1
Abhishek    1
            ..
Zihan       1
Zike        1
Zixuan      1
Name: name, Length: 250, dtype: int64
We can also group roster by 'section'.
# One row per unique section.
roster.groupby('section').count().get('name')
section
12PM    139
1PM     137
Name: name, dtype: int64
Neither grouping alone tells us, for example, the number of 'Ryan's in the 12PM section or the number of 'Marie's in the 1PM section. It seems like we need to group by both 'first' and 'section' – and we can!
roster
| | name | section | first |
---|---|---|---|
0 | Derrick Gernlq | 1PM | Derrick |
1 | Tommy Vbpsht | 12PM | Tommy |
2 | Grace Smgsmb | 12PM | Grace |
... | ... | ... | ... |
273 | Norah Pcqynf | 12PM | Norah |
274 | Harry Jwofgg | 1PM | Harry |
275 | Zhe Ltynpn | 1PM | Zhe |
276 rows × 3 columns
We can pass a list of column names to .groupby!
roster.groupby(['section', 'first']).count()
| | | name |
---|---|---|
section | first | |
12PM | Aaron | 2 |
| | Abhishek | 1 |
| | Adriana | 1 |
... | ... | ... |
1PM | Zhe | 1 |
| | Zifei | 1 |
| | Zixuan | 1 |
259 rows × 1 columns
The above DataFrame is telling us, for instance, that there are 2 students with the first name 'Aaron' in the 12PM section. It is not saying that there are only 2 'Aaron's in the course overall. There could be more in the 1PM section.
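To check how many 'Aaron's there are in each section, we could reuse the query-then-group pattern from earlier:
# Counts of students named 'Aaron' in each section.
roster[roster.get('first') == 'Aaron'].groupby('section').count()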
- To group on multiple columns, pass a list of column names to .groupby: df.groupby([col_1, col_2, ..., col_k]).
- This groups by col_1 first. Within each group, it groups by col_2, and so on.
- Above, we grouped by both 'section' and 'first'.
- That is, we first grouped by 'section', and within each section, we grouped by 'first'.
- The result has one row for every unique combination of 'section' and 'first'.
- We can use .reset_index() to "flatten" our DataFrame back to normal.
roster.groupby(['section', 'first']).count().reset_index()
| | section | first | name |
---|---|---|---|
0 | 12PM | Aaron | 2 |
1 | 12PM | Abhishek | 1 |
2 | 12PM | Adriana | 1 |
... | ... | ... | ... |
256 | 1PM | Zhe | 1 |
257 | 1PM | Zifei | 1 |
258 | 1PM | Zixuan | 1 |
259 rows × 3 columns
Question: Does the order of the columns in the list passed to .groupby matter?
roster.groupby(['section', 'first']).count().reset_index()
| | section | first | name |
---|---|---|---|
0 | 12PM | Aaron | 2 |
1 | 12PM | Abhishek | 1 |
2 | 12PM | Adriana | 1 |
... | ... | ... | ... |
256 | 1PM | Zhe | 1 |
257 | 1PM | Zifei | 1 |
258 | 1PM | Zixuan | 1 |
259 rows × 3 columns
roster.groupby(['first', 'section']).count().reset_index()
| | first | section | name |
---|---|---|---|
0 | Aaron | 12PM | 2 |
1 | Aarush | 1PM | 1 |
2 | Abhishek | 12PM | 1 |
... | ... | ... | ... |
256 | Zihan | 12PM | 1 |
257 | Zike | 12PM | 1 |
258 | Zixuan | 1PM | 1 |
259 rows × 3 columns
Answer: Kind of. The order of the rows and columns will be different, but the content will be the same.
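One quick way to convince yourself of this is to compare the two results directly; a minimal sketch (the variable names here are just for illustration):
# Both orderings have the same number of rows: one per unique (section, first name) pair.
by_section_first = roster.groupby(['section', 'first']).count().reset_index()
by_first_section = roster.groupby(['first', 'section']).count().reset_index()
by_section_first.shape[0] == by_first_section.shape[0]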
Using counts, find the lecture section with the most 'Ryan's.
ryan_counts = counts[counts.get('first') == 'Ryan']
ryan_counts.sort_values('name', ascending=False).get('section').iloc[0]
counts = roster.groupby(['section', 'first']).count().reset_index()
counts
| | section | first | name |
---|---|---|---|
0 | 12PM | Aaron | 2 |
1 | 12PM | Abhishek | 1 |
2 | 12PM | Adriana | 1 |
... | ... | ... | ... |
256 | 1PM | Zhe | 1 |
257 | 1PM | Zifei | 1 |
258 | 1PM | Zixuan | 1 |
259 rows × 3 columns
...
Ellipsis
Using counts, find the shortest first name in the class that is shared by at least two students in the same section.
with_len = counts.assign(length=counts.get('first').apply(len))
with_len[with_len.get('name') >= 2].sort_values('length') # Turns out there are two possible answers: Ryan and Alec.
...
Ellipsis
This dataset contains the sea surface temperature in La Jolla, on many days ranging from August 22, 1916 to December 31, 2021.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp
| | YEAR | MONTH | DAY | SURFACE_TEMP |
---|---|---|---|---|
0 | 1916 | 8 | 22 | 19.5 |
1 | 1916 | 8 | 23 | 19.9 |
2 | 1916 | 8 | 24 | 19.7 |
... | ... | ... | ... | ... |
37195 | 2021 | 12 | 29 | 15.0 |
37196 | 2021 | 12 | 30 | 15.1 |
37197 | 2021 | 12 | 31 | 15.4 |
37198 rows × 4 columns
We want to find the single month (e.g. November 1998) with the highest average 'SURFACE_TEMP'.
Which of the following would help us achieve this goal?
A. sea_temp.groupby('SURFACE_TEMP').mean()
B. sea_temp.groupby('MONTH').mean()
C. sea_temp.groupby(['YEAR', 'MONTH']).mean()
D. sea_temp.groupby(['MONTH', 'DAY']).mean()
E. sea_temp.groupby(['MONTH', 'SURFACE_TEMP']).mean()
...
Ellipsis
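For reference, one way to work toward this goal, assuming we group by both 'YEAR' and 'MONTH' so that each row corresponds to a single month of a single year, is sketched below:
# Average 'SURFACE_TEMP' for each (year, month) pair, warmest first.
(sea_temp
 .groupby(['YEAR', 'MONTH'])
 .mean()
 .reset_index()
 .sort_values('SURFACE_TEMP', ascending=False)
 .get(['YEAR', 'MONTH', 'SURFACE_TEMP'])
)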
(sea_temp
.groupby('MONTH')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
(sea_temp
.groupby('YEAR')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
To recap:
- Pass a list of column names to .groupby to group on multiple columns. This creates groups within groups.
- Use .reset_index() after grouping on multiple columns to move the MultiIndex back to the columns.
Next, let's look at how to combine information from two DataFrames.
phones = bpd.DataFrame().assign(
Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
Price=[799, 1099, 999, 449],
Screen=[6.1, 6.7, 6.7, 6.3]
)
inventory = bpd.DataFrame().assign(
Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
Units=[50, 40, 10, 100],
Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)
# Phones on the market right now.
phones
| | Model | Price | Screen |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Which phones my stores have in stock in the area.
inventory
| | Handset | Units | Store |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
phones.merge(inventory, left_on='Model', right_on='Handset')
| | Model | Price | Screen | Handset | Units | Store |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Click through the presentation that appears.
merging_animation()
To "merge" two DataFrames, use .merge:
left_df.merge(
right_df,
left_on='left_col_name',
right_on='right_col_name'
)
left_on and right_on should be column names (they don't have to be the same).
phones
| | Model | Price | Screen |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
inventory
| | Handset | Units | Store |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
# Notice there's no Samsung Galaxy Z Flip in phones_merged!
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
phones_merged
| | Model | Price | Screen | Handset | Units | Store |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
(phones_merged.get('Price') * phones_merged.get('Units')).sum()
171300
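As a sanity check, this matches multiplying price by units for the four rows of phones_merged by hand:
# 140 iPhone 13s at $799, 50 iPhone 13 Pro Maxes at $1099, and 10 Pixel 5as at $449.
799 * (40 + 100) + 1099 * 50 + 449 * 10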
You can add %%pt to the top of a code cell to explain the last line of babypandas code.
%%pt
phones.merge(inventory, left_on='Model', right_on='Handset')
Question: Does the order of the two DataFrames in a merge matter?
%%pt
phones.merge(inventory, left_on='Model', right_on='Handset')
%%pt
inventory.merge(phones, left_on='Handset', right_on='Model')
Answer: The order of the rows and columns will be different, but the content will be the same.
If the column you want to merge on has the same name in both DataFrames, then instead of using left_on='col' and right_on='col', you can just say on='col'.
phones
| | Model | Price | Screen |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled
| | Units | Store | Model |
---|---|---|---|
0 | 50 | Westfield UTC | iPhone 13 Pro Max |
1 | 40 | Westfield UTC | iPhone 13 |
2 | 10 | Fashion Valley | Pixel 5a |
3 | 100 | Downtown | iPhone 13 |
In this example, the column we want to merge on in both DataFrames is named 'Model', so we can just use on='Model'.
phones.merge(inventory_relabeled, on='Model')
| | Model | Price | Screen | Units | Store |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
Notice: There's only one column containing phone names now.
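By contrast, the earlier left_on/right_on merge kept both 'Model' and 'Handset'. If you go that route, you could always drop the duplicate column afterwards, for example:
# Dropping the redundant 'Handset' column from the earlier merge result.
phones_merged.drop(columns=['Handset'])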
To merge using the index of a DataFrame rather than one of its columns, use left_index=True or right_index=True instead of left_on or right_on.
phones
| | Model | Price | Screen |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset
| | Units | Store |
---|---|---|
Handset | ||
iPhone 13 Pro Max | 50 | Westfield UTC |
iPhone 13 | 40 | Westfield UTC |
Pixel 5a | 10 | Fashion Valley |
iPhone 13 | 100 | Downtown |
In this example, we want to merge using the 'Model' column in the left DataFrame (phones) and the index in the right DataFrame (inventory_by_handset); note that 'Handset' is not a column in the right DataFrame anymore.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)
| | Model | Price | Screen | Units | Store |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
0 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
1 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
nice_weather_cities = bpd.DataFrame().assign(
city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
state=['California', 'California', 'Texas', 'California'],
today_high_temp=['79', '83', '87', '87']
)
schools = bpd.DataFrame().assign(
name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'],
city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='city')?
A. 4 B. 5 C. 6 D. 7 E. 8
nice_weather_cities
| | city | state | today_high_temp |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
| | name | city | state | graduation_rate |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
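After making a prediction using the two DataFrames above, one way to check it is with a .shape[0] count, just like the one used for the 'state' merge below:
# Number of rows in the merge on 'city'.
nice_weather_cities.merge(schools, on='city').shape[0]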
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='state')?
nice_weather_cities
| | city | state | today_high_temp |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
| | name | city | state | graduation_rate |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
nice_weather_cities.merge(schools, on='state')
| | city_x | state | today_high_temp | name | city_y | graduation_rate |
---|---|---|---|---|---|---|
0 | La Jolla | California | 79 | UCSD | La Jolla | 0.87 |
1 | La Jolla | California | 79 | University of San Diego | San Diego | 0.78 |
2 | La Jolla | California | 79 | SDSU | San Diego | 0.83 |
... | ... | ... | ... | ... | ... | ... |
10 | Los Angeles | California | 87 | SDSU | San Diego | 0.83 |
11 | Los Angeles | California | 87 | UCLA | Los Angeles | 0.91 |
12 | Austin | Texas | 87 | UT Austin | Austin | 0.81 |
13 rows × 6 columns
nice_weather_cities.merge(schools, on='state').shape[0]
13
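To see where 13 comes from: nice_weather_cities has three 'California' rows and one 'Texas' row, schools has four 'California' rows and one 'Texas' row, and the merge keeps every matching pair of rows.
# 3 California cities × 4 California schools, plus 1 Texas city × 1 Texas school.
3 * 4 + 1 * 1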
To summarize:
- To group on multiple columns, pass a list of column names to .groupby.
- To combine information from two DataFrames, use .merge.
- When using .merge, Python searches for a match between a specified column in each DataFrame and combines the rows with a match.
Related exam problems are available to help you practice merging.
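As a final illustration, here is a minimal sketch that combines both ideas from this lecture, reusing the phones and inventory DataFrames from earlier: merge first, then group on multiple columns.
# Units in stock for each combination of store and phone model.
(phones
 .merge(inventory, left_on='Model', right_on='Handset')
 .get(['Store', 'Model', 'Units'])
 .groupby(['Store', 'Model'])
 .sum()
 .reset_index()
)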