# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")  # render plots as crisp SVG in the notebook
plt.style.use('ggplot')
# Abbreviate long arrays and DataFrames when they are displayed.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame  # used by merging_animation below
def merging_animation():
    """Embed the Google Slides animation that illustrates how .merge works."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000&rm=minimal"
    frame_width = 825
    frame_height = 500
    display(IFrame(slides_url, frame_width, frame_height))
# Load the anonymized course roster: one row per student, with their
# (anonymized) name and lecture section.
roster = bpd.read_csv('data/roster-anon.csv')
roster
name | section | |
---|---|---|
0 | Kavya Fquroe | 10AM |
1 | Victoria Yppmzx | 10AM |
2 | An-Chi Tmbqlr | 8AM |
... | ... | ... |
522 | Mehri Osrvjq | 9AM |
523 | Noah Byphhr | 9AM |
524 | Emily Hchqii | 9AM |
525 rows × 2 columns
Recall, last class, we extracted the first name of each student in the class.
def first_name(full_name):
    '''Returns the first name given a full name.

    Splits on any run of whitespace, so leading spaces or doubled
    spaces between names don't produce an empty or wrong result.
    '''
    # str.split() with no argument splits on runs of whitespace and
    # ignores leading/trailing whitespace, unlike split(' ') which
    # returns '' as the first piece of '  Mary Jane'.
    return full_name.split()[0]
# Add a 'first' column containing each student's first name.
roster = roster.assign(
first=roster.get('name').apply(first_name)
)
roster
name | section | first | |
---|---|---|---|
0 | Kavya Fquroe | 10AM | Kavya |
1 | Victoria Yppmzx | 10AM | Victoria |
2 | An-Chi Tmbqlr | 8AM | An-Chi |
... | ... | ... | ... |
522 | Mehri Osrvjq | 9AM | Mehri |
523 | Noah Byphhr | 9AM | Noah |
524 | Emily Hchqii | 9AM | Emily |
525 rows × 3 columns
We discovered that Kevin is the most popular first name overall.
# Count how many students share each first name. After .count(), every
# column holds the group size, so we keep just 'name' as the count column.
name_counts = (
roster
.groupby('first')
.count()
.sort_values('name', ascending=False)
.get(['name'])
)
name_counts
name | |
---|---|
first | |
Kevin | 7 |
Daniel | 6 |
Ryan | 6 |
... | ... |
Hengyu | 1 |
Heeju | 1 |
Zubin | 1 |
438 rows × 1 columns
To find the number of 'Kevin's in each lecture section, we can query for only the rows corresponding to 'Kevin's, and then group by 'section'.
# Number of 'Kevin's per section: filter to Kevins, then group by section.
roster[roster.get('first') == 'Kevin'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
1PM | 1 | 1 |
8AM | 1 | 1 |
9AM | 4 | 4 |
But what if we want to know the number of 'Victoria's and 'Ethan's per section, too?
# Same idea for 'Victoria': filter first, then group by section.
roster[roster.get('first') == 'Victoria'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
9AM | 1 | 1 |
# And again for 'Ethan' — one query per name gets tedious quickly.
roster[roster.get('first') == 'Ethan'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 3 | 3 |
Is there a way to do this for all first names and sections all at once?
First, let's group roster by 'first' alone — this gives one row per unique first name.
# One row per unique first name.
roster.groupby('first').count().get('name')
first Aadit 1 Aaron 1 Aarshia 1 .. Zhijie 1 Ziyan 1 Zubin 1 Name: name, Length: 438, dtype: int64
Similarly, grouping roster by 'section' alone gives one row per unique section.
# One row per unique section.
roster.groupby('section').count().get('name')
section 10AM 150 1PM 150 8AM 75 9AM 150 Name: name, dtype: int64
But neither grouping alone tells us, say, the number of 'Kevin's in the 9AM section or the number of 'Victoria's in the 1PM section. To answer that, we'd like to group by two columns at once – 'first' and 'section' – and we can!
roster
name | section | first | |
---|---|---|---|
0 | Kavya Fquroe | 10AM | Kavya |
1 | Victoria Yppmzx | 10AM | Victoria |
2 | An-Chi Tmbqlr | 8AM | An-Chi |
... | ... | ... | ... |
522 | Mehri Osrvjq | 9AM | Mehri |
523 | Noah Byphhr | 9AM | Noah |
524 | Emily Hchqii | 9AM | Emily |
525 rows × 3 columns
We can pass a list of column names to .groupby!
# Group by 'section' first, then by 'first' within each section.
# The result has a MultiIndex: one row per (section, first name) pair.
roster.groupby(['section', 'first']).count()
name | ||
---|---|---|
section | first | |
10AM | Aadit | 1 |
Aiden | 1 | |
Akshay | 1 | |
... | ... | ... |
9AM | Yun | 1 |
Zhaocheng | 1 | |
Ziyan | 1 |
497 rows × 1 columns
The above DataFrame is telling us, for instance, that there is 1 student with the first name 'Aadit' in the 10AM section.
It is not saying that there is only one 'Aadit' in the course overall — there could be more in the other sections.
In general, df.groupby(['col_1', 'col_2', ..., 'col_k']) groups by 'col_1' first; within each group, it then groups by 'col_2', and so on. Above, we grouped by 'section' and 'first': first by 'section', and within each section, by 'first'. The resulting DataFrame has a MultiIndex made up of 'section' and 'first'. We can use .reset_index() to "flatten" our DataFrame back to normal.
roster.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Aadit | 1 |
1 | 10AM | Aiden | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
494 | 9AM | Yun | 1 |
495 | 9AM | Zhaocheng | 1 |
496 | 9AM | Ziyan | 1 |
497 rows × 3 columns
# Grouping by ['section', 'first'], then flattening the MultiIndex.
roster.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Aadit | 1 |
1 | 10AM | Aiden | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
494 | 9AM | Yun | 1 |
495 | 9AM | Zhaocheng | 1 |
496 | 9AM | Ziyan | 1 |
497 rows × 3 columns
# Same grouping with the column order swapped: same content, but the rows
# and columns come out in a different order.
roster.groupby(['first', 'section']).count().reset_index()
first | section | name | |
---|---|---|---|
0 | Aadit | 10AM | 1 |
1 | Aaron | 8AM | 1 |
2 | Aarshia | 1PM | 1 |
... | ... | ... | ... |
494 | Zhijie | 1PM | 1 |
495 | Ziyan | 9AM | 1 |
496 | Zubin | 10AM | 1 |
497 rows × 3 columns
Answer: Kind of. The order of the rows and columns will be different, but the content will be the same.
Using counts, find the lecture section with the most 'Kevin's.
# Filter counts down to the rows for 'Kevin', then take the section with
# the largest count (first row after sorting by count, descending).
kevin_counts = counts[counts.get('first') == 'Kevin']
kevin_counts.sort_values('name', ascending=False).get('section').iloc[0]
# One row per (section, first name) pair; the 'name' column holds the count.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Aadit | 1 |
1 | 10AM | Aiden | 1 |
2 | 10AM | Akshay | 1 |
... | ... | ... | ... |
494 | 9AM | Yun | 1 |
495 | 9AM | Zhaocheng | 1 |
496 | 9AM | Ziyan | 1 |
497 rows × 3 columns
...
Ellipsis
Using counts, find the shortest first name in the class that is shared by at least two students in the same section.
# Add each first name's length, keep names shared by at least 2 students
# in the same section, and sort by length (shortest first).
with_len = counts.assign(length=counts.get('first').apply(len))
with_len[with_len.get('name') >= 2].sort_values('length')
# Turns out there are many possible answers: Eric, Ryan, Evan and John.
...
Ellipsis
This dataset contains the sea surface temperature in La Jolla, on many days ranging from August 22, 1916 to March 31, 2023.
# Daily sea surface temperatures in La Jolla, Aug 22, 1916 - Mar 31, 2023.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp
YEAR | MONTH | DAY | SURFACE_TEMP | |
---|---|---|---|---|
0 | 1916 | 8 | 22 | 19.5 |
1 | 1916 | 8 | 23 | 19.9 |
2 | 1916 | 8 | 24 | 19.7 |
... | ... | ... | ... | ... |
37648 | 2023 | 3 | 29 | 14.4 |
37649 | 2023 | 3 | 30 | 14.8 |
37650 | 2023 | 3 | 31 | 15.1 |
37651 rows × 4 columns
We want to find the single month (e.g. November 1998) with the highest average 'SURFACE_TEMP'.
Which of the following would help us achieve this goal?
A. sea_temp.groupby('SURFACE_TEMP').mean()
B. sea_temp.groupby('MONTH').mean()
C. sea_temp.groupby(['YEAR', 'MONTH']).mean()
D. sea_temp.groupby(['MONTH', 'DAY']).mean()
E. sea_temp.groupby(['MONTH', 'SURFACE_TEMP']).mean()
...
Ellipsis
# Average temperature for each month of the year, pooled across all years.
(sea_temp
.groupby('MONTH')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
# Why is there a sudden drop at the end? Look at the dates of data collection!
# (2023 only includes January-March, which are colder months.)
(sea_temp
.groupby('YEAR')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
Summary: pass a list of column names to .groupby to group on multiple columns; this creates groups within groups. Use .reset_index() afterwards to move the MultiIndex back to the columns.
# (Summary, continued: use .reset_index() after grouping on multiple
# columns to move the MultiIndex back to the columns.)
# Phones currently on the market: one row per model.
phones = bpd.DataFrame().assign(
    Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
    Price=[799, 1099, 999, 449],
    Screen=[6.1, 6.7, 6.7, 6.3]
)
# Stock at my stores: one row per (handset, store) stock entry.
inventory = bpd.DataFrame().assign(
Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
Units=[50, 40, 10, 100],
Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)
# Phones on the market right now.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Which phones my stores have in stock in the area.
inventory
Handset | Units | Store | |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
# Combine each phone model with every inventory row for that model.
# 'Samsung Galaxy Z Flip' has no match in inventory, so it disappears.
phones.merge(inventory, left_on='Model', right_on='Handset')
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Click through the presentation that appears.
merging_animation()
The .merge method. To "merge" two DataFrames:
left_df.merge(
right_df,
left_on='left_col_name',
right_on='right_col_name'
)
left_on and right_on should be column names (they don't have to be the same).
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
inventory
Handset | Units | Store | |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
# Notice there's no Samsung Galaxy Z Flip in phones_merged!
# Rows without a match in the other DataFrame are dropped.
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
phones_merged
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Total value of the stock: price times units, summed over all rows.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()
171300
# phones on the left, inventory on the right.
phones.merge(inventory, left_on='Model', right_on='Handset')
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Merging in the other order: same content, different row/column order.
inventory.merge(phones, left_on='Handset', right_on='Model')
Handset | Units | Store | Model | Price | Screen | |
---|---|---|---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC | iPhone 13 Pro Max | 1099 | 6.7 |
1 | iPhone 13 | 40 | Westfield UTC | iPhone 13 | 799 | 6.1 |
2 | iPhone 13 | 100 | Downtown | iPhone 13 | 799 | 6.1 |
3 | Pixel 5a | 10 | Fashion Valley | Pixel 5a | 449 | 6.3 |
Answer: The order of the rows and columns will be different, but the content will be the same.
Instead of using left_on='col' and right_on='col', you can just say on='col'.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Rename 'Handset' to 'Model' so both DataFrames share a column name.
inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled
Units | Store | Model | |
---|---|---|---|
0 | 50 | Westfield UTC | iPhone 13 Pro Max |
1 | 40 | Westfield UTC | iPhone 13 |
2 | 10 | Fashion Valley | Pixel 5a |
3 | 100 | Downtown | iPhone 13 |
In this example, the column we want to merge on in both DataFrames is named 'Model', so we can just use on='Model'.
# Both DataFrames have a 'Model' column, so a single on= argument works,
# and the shared column appears only once in the result.
phones.merge(inventory_relabeled, on='Model')
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
Notice: There's only one column containing phone names now.
To merge using a DataFrame's index instead of one of its columns, replace left_on or right_on with left_index=True or right_index=True.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Move 'Handset' into the index of inventory.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset
Units | Store | |
---|---|---|
Handset | ||
iPhone 13 Pro Max | 50 | Westfield UTC |
iPhone 13 | 40 | Westfield UTC |
Pixel 5a | 10 | Fashion Valley |
iPhone 13 | 100 | Downtown |
In this example, we want to merge using the 'Model' column in the left DataFrame (phones) and the index in the right DataFrame (inventory_by_handset); note that 'Handset' is not a column in the right DataFrame anymore.
# Match the 'Model' column of phones against the index of the right
# DataFrame (right_index=True instead of right_on).
phones.merge(inventory_by_handset, left_on='Model', right_index=True)
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
0 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
1 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
# Cities with nice weather today.
# NOTE(review): today_high_temp holds strings, not ints — fine for display,
# but would need conversion before arithmetic.
nice_weather_cities = bpd.DataFrame().assign(
city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
state=['California', 'California', 'Texas', 'California'],
today_high_temp=['79', '83', '87', '87']
)
# Universities with their city, state, and graduation rate.
schools = bpd.DataFrame().assign(
name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'],
city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='city')
?
A. 4 B. 5 C. 6 D. 7 E. 8
nice_weather_cities
city | state | today_high_temp | |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
name | city | state | graduation_rate | |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='state')
?
nice_weather_cities
city | state | today_high_temp | |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
name | city | state | graduation_rate | |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
# Every California city pairs with every California school, and likewise
# for Texas — matches multiply, they aren't one-to-one.
nice_weather_cities.merge(schools, on='state')
city_x | state | today_high_temp | name | city_y | graduation_rate | |
---|---|---|---|---|---|---|
0 | La Jolla | California | 79 | UCSD | La Jolla | 0.87 |
1 | La Jolla | California | 79 | University of San Diego | San Diego | 0.78 |
2 | La Jolla | California | 79 | SDSU | San Diego | 0.83 |
... | ... | ... | ... | ... | ... | ... |
10 | Los Angeles | California | 87 | SDSU | San Diego | 0.83 |
11 | Los Angeles | California | 87 | UCLA | Los Angeles | 0.91 |
12 | Austin | Texas | 87 | UT Austin | Austin | 0.81 |
13 rows × 6 columns
# 3 CA cities x 4 CA schools + 1 TX city x 1 TX school = 12 + 1 = 13 rows.
nice_weather_cities.merge(schools, on='state').shape[0]
13
Here are related exam problems to help you practice merging.

Summary: Use .groupby to group on one or more columns, and .merge to combine two DataFrames. When we use .merge, Python searches for a match between a specified column in each DataFrame and combines the rows with a match.

Next time: if-statements, to execute code only when certain conditions are met, and for-loops, to repeat code many times.