# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Notebook magics: load the pandas_tutor extension for step-by-step
# visualizations, sized for projection.
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
# Render plots as crisp SVG with the 'ggplot' theme.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Keep printed arrays/DataFrames short and rounded so output fits on slides.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame
def merging_animation():
    """Embed and display the Google Slides animation that illustrates
    how ``.merge`` combines two DataFrames row by row."""
    slides_url = (
        "https://docs.google.com/presentation/d/e/2PACX-1vSk2FfJ4K_An_CQwcN_Yu5unpJckOZjVQDFqZ78ZTTMmowUsCQKKVnum0_m6TaiGquQ44E3FiS9g2Y4/embed?start=false&loop=false&delayms=60000"
    )
    # 825x500 is a comfortable in-notebook size for the slide deck.
    display(IFrame(slides_url, 825, 500))
This question is best solved using if-statements, which we won't formally cover until the next lecture. Here is the solution for part of this question.
def cut_numerical(cut):
    """Convert a diamond's cut quality (a string) to a numerical rating.

    'Ideal' -> 5, 'Premium' -> 4, 'Very Good' -> 3, 'Good' -> 2, 'Fair' -> 1.
    Any other value maps to None, matching the original if-chain's
    fall-through behavior.
    """
    ratings = {
        'Ideal': 5,
        'Premium': 4,
        'Very Good': 3,
        'Good': 2,
        'Fair': 1,
    }
    return ratings.get(cut)
The other two functions you need to write will be very similar, with one if-condition for each possible value.
# Load the anonymized class roster: one row per student.
roster = bpd.read_csv('data/roster-anon.csv')
roster
name | section | |
---|---|---|
0 | Anya Iatypd | 10AM |
1 | Nathaniel Kcyrfu | 11AM |
2 | Jae Oadpmw | 10AM |
... | ... | ... |
347 | Danny Zsoyxb | 10AM |
348 | Alex Lrmwwt | 11AM |
349 | Giovanni Ibkdsu | 11AM |
350 rows × 2 columns
Recall, last class, we extracted the first name of each student in the class.
def first_name(full_name):
    """Return the first name given a full name (the text before the
    first space)."""
    before_space, _, _ = full_name.partition(' ')
    return before_space
# Add a 'first' column by applying first_name to every entry of 'name'.
roster = roster.assign(
first=roster.get('name').apply(first_name)
)
roster
name | section | first | |
---|---|---|---|
0 | Anya Iatypd | 10AM | Anya |
1 | Nathaniel Kcyrfu | 11AM | Nathaniel |
2 | Jae Oadpmw | 10AM | Jae |
... | ... | ... | ... |
347 | Danny Zsoyxb | 10AM | Danny |
348 | Alex Lrmwwt | 11AM | Alex |
349 | Giovanni Ibkdsu | 11AM | Giovanni |
350 rows × 3 columns
How many 'Ryan's are in each section? We discovered that 'Ryan' is the most popular first name overall.
# Count the students sharing each first name, most common names first.
first_counts = (roster.groupby('first').count()
.sort_values('name', ascending=False)
.get(['name']))
To find the number of 'Ryan's in each lecture section, we can query for only the rows corresponding to 'Ryan's, and then group by 'section'.
# Keep only the rows for 'Ryan', then count them per lecture section.
roster[roster.get('first') == 'Ryan'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 2 | 2 |
11AM | 1 | 1 |
9AM | 3 | 3 |
But what if we want to know the number of 'Joseph's and 'Janice's per section, too?
# Same pattern for 'Joseph': query for the name, then count per section.
roster[roster.get('first') == 'Joseph'].groupby('section').count()
name | first | |
---|---|---|
section | ||
10AM | 1 | 1 |
# And for 'Janice': query for the name, then count per section.
roster[roster.get('first') == 'Janice'].groupby('section').count()
name | first | |
---|---|---|
section | ||
11AM | 1 | 1 |
9AM | 1 | 1 |
Is there a way to do this for all first names and sections all at once?
We'd like one table that tells us, for instance, the number of 'Ryan's in the 9AM section and the number of 'Janice's in the 11AM section.

roster
name | section | first | |
---|---|---|---|
0 | Anya Iatypd | 10AM | Anya |
1 | Nathaniel Kcyrfu | 11AM | Nathaniel |
2 | Jae Oadpmw | 10AM | Jae |
... | ... | ... | ... |
347 | Danny Zsoyxb | 10AM | Danny |
348 | Alex Lrmwwt | 11AM | Alex |
349 | Giovanni Ibkdsu | 11AM | Giovanni |
350 rows × 3 columns
# Group by two columns at once: one group per (section, first name) pair.
roster.groupby(['section', 'first']).count()
name | ||
---|---|---|
section | first | |
10AM | Aahil | 1 |
Aishwarya | 1 | |
Alejandro | 1 | |
... | ... | ... |
9AM | Zack | 1 |
Zeyuan | 1 | |
Zhanlin | 1 |
337 rows × 1 columns
The above DataFrame is telling us, for instance, that there is 1 student with the first name 'Zhanlin'
in the 9AM section.
It is not saying that there is only 1 'Zhanlin'
in the course overall. There could be more, just in other sections.
.groupby with subgroups

To group by multiple columns, pass a list of column names to .groupby:

df.groupby([col_1, col_2, ..., col_k])

Groups are formed by col_1 first, then by col_2 within each group of col_1, and so on. Here, we group by 'section' and 'first'. Use .reset_index() to "flatten" our DataFrame back to normal.

roster.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Aahil | 1 |
1 | 10AM | Aishwarya | 1 |
2 | 10AM | Alejandro | 1 |
... | ... | ... | ... |
334 | 9AM | Zack | 1 |
335 | 9AM | Zeyuan | 1 |
336 | 9AM | Zhanlin | 1 |
337 rows × 3 columns
# Group by (section, first), then flatten the MultiIndex back into columns.
roster.groupby(['section', 'first']).count().reset_index()
section | first | name | |
---|---|---|---|
0 | 10AM | Aahil | 1 |
1 | 10AM | Aishwarya | 1 |
2 | 10AM | Alejandro | 1 |
... | ... | ... | ... |
334 | 9AM | Zack | 1 |
335 | 9AM | Zeyuan | 1 |
336 | 9AM | Zhanlin | 1 |
337 rows × 3 columns
# Same grouping with the column order swapped: same content, rows/columns
# appear in a different order.
roster.groupby(['first', 'section']).count().reset_index()
first | section | name | |
---|---|---|---|
0 | Aahil | 10AM | 1 |
1 | Abhay | 9AM | 1 |
2 | Aditi | 11AM | 1 |
... | ... | ... | ... |
334 | Zixuan | 10AM | 1 |
335 | Ziyao | 10AM | 1 |
336 | Zoe | 11AM | 1 |
337 rows × 3 columns
Answer: Kind of. The order of the rows and columns will be different, but the content will be the same.
Using counts
, find the lecture section with the most 'Giovanni'
s.
# Flattened counts table: one row per (section, first name) combination.
counts = roster.groupby(['section', 'first']).count().reset_index()
counts
section | first | name | |
---|---|---|---|
0 | 10AM | Aahil | 1 |
1 | 10AM | Aishwarya | 1 |
2 | 10AM | Alejandro | 1 |
... | ... | ... | ... |
334 | 9AM | Zack | 1 |
335 | 9AM | Zeyuan | 1 |
336 | 9AM | Zhanlin | 1 |
337 rows × 3 columns
...
Ellipsis
Using counts
, find the shortest first name in the class that is shared by at least two students in the same section.
...
Ellipsis
This dataset contains the sea surface temperature in La Jolla, on many days ranging from August 22, 1916 to December 31, 2021.
# Daily La Jolla sea surface temperatures, Aug 22, 1916 - Dec 31, 2021.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp
YEAR | MONTH | DAY | SURFACE_TEMP | |
---|---|---|---|---|
0 | 1916 | 8 | 22 | 19.5 |
1 | 1916 | 8 | 23 | 19.9 |
2 | 1916 | 8 | 24 | 19.7 |
... | ... | ... | ... | ... |
37195 | 2021 | 12 | 29 | 15.0 |
37196 | 2021 | 12 | 30 | 15.1 |
37197 | 2021 | 12 | 31 | 15.4 |
37198 rows × 4 columns
We want to find the single month (e.g. November 1998) with the highest average 'SURFACE_TEMP'
.
Which of the following would help us achieve this goal?
A. sea_temp.groupby('SURFACE_TEMP').mean()
B. sea_temp.groupby('MONTH').mean()
C. sea_temp.groupby(['YEAR', 'MONTH']).mean()
D. sea_temp.groupby(['MONTH', 'DAY']).mean()
E. sea_temp.groupby(['MONTH', 'SURFACE_TEMP']).mean()
...
Ellipsis
# Average surface temperature per calendar month (all years pooled).
(sea_temp
.groupby('MONTH')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
# Average surface temperature per year.
(sea_temp
.groupby('YEAR')
.mean()
.plot(kind='line', y='SURFACE_TEMP')
);
Summary: .groupby with subgroups

Pass a list of column names to .groupby to make subgroups. Use .reset_index() after grouping with subgroups to move the MultiIndex back to the columns.

phones = bpd.DataFrame().assign(
Model=['iPhone 13', 'iPhone 13 Pro Max', 'Samsung Galaxy Z Flip', 'Pixel 5a'],
Price=[799, 1099, 999, 449],
Screen=[6.1, 6.7, 6.7, 6.3]
)
# Inventory: how many units of each handset each store has in stock.
inventory = bpd.DataFrame().assign(
Handset=['iPhone 13 Pro Max', 'iPhone 13', 'Pixel 5a', 'iPhone 13'],
Units=[50, 40, 10, 100],
Store=['Westfield UTC', 'Westfield UTC', 'Fashion Valley', 'Downtown']
)
# Phones on the market right now
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Which phones my stores have in stock in the area
inventory
Handset | Units | Store | |
---|---|---|---|
0 | iPhone 13 Pro Max | 50 | Westfield UTC |
1 | iPhone 13 | 40 | Westfield UTC |
2 | Pixel 5a | 10 | Fashion Valley |
3 | iPhone 13 | 100 | Downtown |
Question: If I sell all of the phones in my inventory, how much will I make in revenue?
# Match 'Model' in phones with 'Handset' in inventory; rows without a
# match on either side are dropped.
phones.merge(inventory, left_on='Model', right_on='Handset')
Model | Price | Screen | Handset | Units | Store | |
---|---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | iPhone 13 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | iPhone 13 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | iPhone 13 Pro Max | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | Pixel 5a | 10 | Fashion Valley |
# Click through the presentation that appears
merging_animation()
The general syntax of .merge:

left_df.merge(
    right_df,
    left_on=left_column_name,
    right_on=right_column_name
)

left_on and right_on should be column names (they don't have to be the same).

%%pt
# Notice there's no Samsung Galaxy Z Flip in phones_merged
phones_merged = phones.merge(inventory, left_on='Model', right_on='Handset')
# Total revenue: each merged row pairs a phone's price with the units
# a particular store holds.
(phones_merged.get('Price') * phones_merged.get('Units')).sum()
171300
If the columns to merge on share the same name, use on instead of left_on and right_on.

inventory_relabeled = inventory.assign(Model=inventory.get('Handset')).drop(columns=['Handset'])
inventory_relabeled
Units | Store | Model | |
---|---|---|---|
0 | 50 | Westfield UTC | iPhone 13 Pro Max |
1 | 40 | Westfield UTC | iPhone 13 |
2 | 10 | Fashion Valley | Pixel 5a |
3 | 100 | Downtown | iPhone 13 |
# With on='Model', the shared column appears only once in the result.
phones.merge(inventory_relabeled, on='Model')
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
1 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
2 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
Notice: There's only one column containing phone names now.
%%pt
# Merging in the other order: same content, different row/column order.
inventory.merge(phones, left_on='Handset', right_on='Model')
Answer: The order of the rows and columns will be different, but the content will be the same.
Instead of using left_on or right_on, use left_index=True or right_index=True.
phones
Model | Price | Screen | |
---|---|---|---|
0 | iPhone 13 | 799 | 6.1 |
1 | iPhone 13 Pro Max | 1099 | 6.7 |
2 | Samsung Galaxy Z Flip | 999 | 6.7 |
3 | Pixel 5a | 449 | 6.3 |
# Move 'Handset' into the index so we can merge on the index instead of a column.
inventory_by_handset = inventory.set_index('Handset')
inventory_by_handset
Units | Store | |
---|---|---|
Handset | ||
iPhone 13 Pro Max | 50 | Westfield UTC |
iPhone 13 | 40 | Westfield UTC |
Pixel 5a | 10 | Fashion Valley |
iPhone 13 | 100 | Downtown |
# Match the left DataFrame's 'Model' column against the right DataFrame's index.
phones.merge(inventory_by_handset, left_on='Model', right_index=True)
Model | Price | Screen | Units | Store | |
---|---|---|---|---|---|
0 | iPhone 13 | 799 | 6.1 | 40 | Westfield UTC |
0 | iPhone 13 | 799 | 6.1 | 100 | Downtown |
1 | iPhone 13 Pro Max | 1099 | 6.7 | 50 | Westfield UTC |
3 | Pixel 5a | 449 | 6.3 | 10 | Fashion Valley |
# Cities with nice weather today, with their state and today's high temperature.
nice_weather_cities = bpd.DataFrame().assign(
city=['La Jolla', 'San Diego', 'Austin', 'Los Angeles'],
state=['California', 'California', 'Texas', 'California'],
today_high_temp=['79', '83', '87', '87']
)
# Universities, with their city, state, and graduation rate.
schools = bpd.DataFrame().assign(
name=['UCSD', 'University of Chicago', 'University of San Diego','Johns Hopkins University', 'UT Austin', 'SDSU', 'UCLA'],
city=['La Jolla', 'Chicago', 'San Diego', 'Baltimore', 'Austin', 'San Diego', 'Los Angeles'],
state=['California', 'Illinois', 'California', 'Maryland', 'Texas', 'California', 'California'],
graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91 ]
)
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='city')
?
A. 4 B. 5 C. 6 D. 7 E. 8
nice_weather_cities
city | state | today_high_temp | |
---|---|---|---|
0 | La Jolla | California | 79 |
1 | San Diego | California | 83 |
2 | Austin | Texas | 87 |
3 | Los Angeles | California | 87 |
schools
name | city | state | graduation_rate | |
---|---|---|---|---|
0 | UCSD | La Jolla | California | 0.87 |
1 | University of Chicago | Chicago | Illinois | 0.94 |
2 | University of San Diego | San Diego | California | 0.78 |
3 | Johns Hopkins University | Baltimore | Maryland | 0.92 |
4 | UT Austin | Austin | Texas | 0.81 |
5 | SDSU | San Diego | California | 0.83 |
6 | UCLA | Los Angeles | California | 0.91 |
Without writing code, how many rows are in nice_weather_cities.merge(schools, on='state')
?
%%pt
# Merging on 'state' pairs every matching combination:
# 3 California cities x 4 California schools + 1 Texas city x 1 Texas school
# = 13 rows.
nice_weather_cities.merge(schools, on='state')
nice_weather_cities.merge(schools, on='state').shape[0]
13
Summary:
- To make subgroups, pass a list of column names to .groupby.
- To combine information from multiple DataFrames, use .merge.
- When using .merge, Python searches for a match between a specified column in each DataFrame and combines the rows with a match.