# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10, 5)
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import HTML, display, IFrame
There are several keyboard shortcuts built into Jupyter Notebooks designed to help you save time. To see them, either click the keyboard button in the toolbar above or hit the H key on your keyboard (as long as you're not actively editing a cell).
Particularly useful shortcuts:
Action | Keyboard shortcut |
---|---|
Run cell + jump to next cell | SHIFT + ENTER |
Save the notebook | CTRL/CMD + S |
Create new cell above/below | A/B |
Delete cell | DD |
Don't forget about the DSC 10 Reference Sheet and the Resources tab of the course website!
Run these cells to load the Little Women data from Lecture 1.
chapters = open('data/lw.txt').read().split('CHAPTER ')[1:]
# Counts of names in the chapters of Little Women
counts = bpd.DataFrame().assign(
Amy=np.char.count(chapters, 'Amy'),
Beth=np.char.count(chapters, 'Beth'),
Jo=np.char.count(chapters, 'Jo'),
Meg=np.char.count(chapters, 'Meg'),
Laurie=np.char.count(chapters, 'Laurie'),
)
# cumulative number of times each name appears
lw_counts = bpd.DataFrame().assign(
Amy=np.cumsum(counts.get('Amy')),
Beth=np.cumsum(counts.get('Beth')),
Jo=np.cumsum(counts.get('Jo')),
Meg=np.cumsum(counts.get('Meg')),
Laurie=np.cumsum(counts.get('Laurie')),
Chapter=np.arange(1, 48, 1)
)
lw_counts
Amy | Beth | Jo | Meg | Laurie | Chapter | |
---|---|---|---|---|---|---|
0 | 23 | 26 | 44 | 26 | 0 | 1 |
1 | 36 | 38 | 65 | 46 | 0 | 2 |
2 | 38 | 40 | 127 | 82 | 16 | 3 |
... | ... | ... | ... | ... | ... | ... |
44 | 633 | 461 | 1450 | 675 | 581 | 45 |
45 | 635 | 462 | 1506 | 679 | 583 | 46 |
46 | 645 | 465 | 1543 | 685 | 596 | 47 |
47 rows × 6 columns
In Lecture 1, we were able to answer questions about the plot of Little Women without having to read the novel and without having to understand Python code. Some of those questions included:
We answered these questions from a data visualization alone!
lw_counts.plot(x='Chapter');
There are two main types of variables:
Which of these is not a numerical variable?
A. Fuel economy in miles per gallon.
B. Number of quarters at UCSD.
C. College at UCSD (Sixth, Seventh, etc).
D. Bank account number.
E. More than one of these are not numerical variables.
The type of visualization we create depends on the kinds of variables we're visualizing.
Note: We may interchange the words "plot", "chart", and "graph"; they all mean the same thing.
Column | Contents |
---|---|
'Actor'
|Name of actor
'Total Gross'
| Total gross domestic box office receipt, in millions of dollars, of all of the actor’s movies
'Number of Movies'
| The number of movies the actor has been in
'Average per Movie'
| Total gross divided by number of movies
'#1 Movie'
| The highest grossing movie the actor has been in
'Gross'
| Gross domestic box office receipt, in millions of dollars, of the actor’s #1 Movie
actors = bpd.read_csv('data/actors.csv').set_index('Actor')
actors
Total Gross | Number of Movies | Average per Movie | #1 Movie | Gross | |
---|---|---|---|---|---|
Actor | |||||
Harrison Ford | 4871.7 | 41 | 118.8 | Star Wars: The Force Awakens | 936.7 |
Samuel L. Jackson | 4772.8 | 69 | 69.2 | The Avengers | 623.4 |
Morgan Freeman | 4468.3 | 61 | 73.3 | The Dark Knight | 534.9 |
... | ... | ... | ... | ... | ... |
Sandra Bullock | 2462.6 | 35 | 70.4 | Minions | 336.0 |
Chris Evans | 2457.8 | 23 | 106.9 | The Avengers | 623.4 |
Anne Hathaway | 2416.5 | 25 | 96.7 | The Dark Knight Rises | 448.1 |
50 rows × 5 columns
What is the relationship between 'Number of Movies'
and 'Total Gross'
?
actors.plot(kind='scatter', x='Number of Movies', y='Total Gross');
df
, use
df.plot(
kind='scatter',
x=x_column_for_horizontal,
y=y_column_for_vertical
)
df
..plot
, it will hide the weird text output that displays.What is the relationship between 'Number of Movies'
and 'Average per Movie'
?
actors.plot(kind='scatter', x='Number of Movies', y='Average per Movie');
Note that in the above plot, there's a negative association and an outlier.
actors[actors.get('Number of Movies') >= 60]
Total Gross | Number of Movies | Average per Movie | #1 Movie | Gross | |
---|---|---|---|---|---|
Actor | |||||
Samuel L. Jackson | 4772.8 | 69 | 69.2 | The Avengers | 623.4 |
Morgan Freeman | 4468.3 | 61 | 73.3 | The Dark Knight | 534.9 |
Bruce Willis | 3189.4 | 60 | 53.2 | Sixth Sense | 293.5 |
Robert DeNiro | 3081.3 | 79 | 39.0 | Meet the Fockers | 279.3 |
Liam Neeson | 2942.7 | 63 | 46.7 | The Phantom Menace | 474.5 |
Whoever they are, they made very few, high grossing movies.
actors[actors.get('Number of Movies') < 10]
Total Gross | Number of Movies | Average per Movie | #1 Movie | Gross | |
---|---|---|---|---|---|
Actor | |||||
Anthony Daniels | 3162.9 | 7 | 451.8 | Star Wars: The Force Awakens | 936.7 |
Column | Content |
---|---|
'Year'
| Year
'Total Gross in Billions'
| Total domestic box office gross, in billions of dollars, of all movies released
'Number of Movies'
| Number of movies released
'#1 Movie'
| Highest grossing movie
movies_by_year = bpd.read_csv('data/movies_by_year.csv').set_index('Year')
movies_by_year
Total Gross in Billions | Number of Movies | #1 Movie | |
---|---|---|---|
Year | |||
2022 | 5.64 | 380 | Top Gun: Maverick |
2021 | 4.48 | 439 | Spider-Man: No Way Home |
2020 | 2.11 | 456 | Bad Boys for Life |
... | ... | ... | ... |
1979 | 1.23 | 40 | Superman |
1978 | 0.83 | 13 | Grease |
1977 | 0.44 | 9 | Star Wars: Episode IV - A New Hope |
46 rows × 3 columns
How has the number of movies changed over time? 🤔
movies_by_year.plot(kind='line', y='Number of Movies');
df
, use
df.plot(
kind='line',
x=x_column_for_horizontal,
y=y_column_for_vertical
)
x=
argument!movies_by_year.plot(kind='line', y='Number of Movies');
We can create a line plot of just 2000 onwards by querying movies_by_year
before calling .plot
.
movies_by_year[movies_by_year.index >= 2000].plot(kind='line', y='Number of Movies');
What do you think explains the declines around 2008 and 2020?
movies_by_year[movies_by_year.index >= 2000].plot(kind='line', y='Total Gross in Billions');
...
Ellipsis
charts = (bpd.read_csv('data/regional-us-daily-2023-01-21.csv')
.set_index('rank')
.get(['track_name', 'artist_names', 'streams', 'uri'])
)
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Flowers | Miley Cyrus | 3356361 | spotify:track:0yLdNVWF3Srea0uzk55zFn |
2 | Kill Bill | SZA | 2479445 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
3 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 1337320 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
... | ... | ... | ... | ... |
198 | Major Distribution | Drake, 21 Savage | 266986 | spotify:track:46s57QULU02Voy0Kup6UEb |
199 | Sun to Me | Zach Bryan | 266968 | spotify:track:1SjsVdSXpwm1kTdYEHoPIT |
200 | The Real Slim Shady | Eminem | 266698 | spotify:track:3yfqSUWxFvZELEM4PmlwIR |
200 rows × 4 columns
How many streams do the top 10 songs have?
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Flowers | Miley Cyrus | 3356361 | spotify:track:0yLdNVWF3Srea0uzk55zFn |
2 | Kill Bill | SZA | 2479445 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
3 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 1337320 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
... | ... | ... | ... | ... |
198 | Major Distribution | Drake, 21 Savage | 266986 | spotify:track:46s57QULU02Voy0Kup6UEb |
199 | Sun to Me | Zach Bryan | 266968 | spotify:track:1SjsVdSXpwm1kTdYEHoPIT |
200 | The Real Slim Shady | Eminem | 266698 | spotify:track:3yfqSUWxFvZELEM4PmlwIR |
200 rows × 4 columns
charts.take(np.arange(10))
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Flowers | Miley Cyrus | 3356361 | spotify:track:0yLdNVWF3Srea0uzk55zFn |
2 | Kill Bill | SZA | 2479445 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
3 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 1337320 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
... | ... | ... | ... | ... |
8 | Anti-Hero | Taylor Swift | 936166 | spotify:track:0V3wPSX9ygBnCm8psDIegu |
9 | golden hour | JVKE | 870031 | spotify:track:5odlY52u43F5BjByhxg7wg |
10 | Unholy (feat. Kim Petras) | Sam Smith, Kim Petras | 859271 | spotify:track:3nqQXoyQOWXiESFLlDF1hG |
10 rows × 4 columns
charts.take(np.arange(10)).plot(kind='barh', x='track_name', y='streams');
df
, use
df.plot(
kind='barh',
x=categorical_column_name,
y=numerical_column_name
)
'barh'
stands for "horizontal".y='Streams'
even though streams are measured by x-axis length.# The bars appear in the opposite order relative to the DataFrame
(charts
.take(np.arange(10))
.sort_values(by='streams')
.plot(kind='barh', x='track_name', y='streams')
);
First, let's create a DataFrame with a single column that describes the number of songs in the top 200 per artist. This involves using .groupby
with .count()
. Since we want one row per artist, we will group by 'artist_names'
.
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Flowers | Miley Cyrus | 3356361 | spotify:track:0yLdNVWF3Srea0uzk55zFn |
2 | Kill Bill | SZA | 2479445 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
3 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 1337320 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
... | ... | ... | ... | ... |
198 | Major Distribution | Drake, 21 Savage | 266986 | spotify:track:46s57QULU02Voy0Kup6UEb |
199 | Sun to Me | Zach Bryan | 266968 | spotify:track:1SjsVdSXpwm1kTdYEHoPIT |
200 | The Real Slim Shady | Eminem | 266698 | spotify:track:3yfqSUWxFvZELEM4PmlwIR |
200 rows × 4 columns
songs_per_artist = charts.groupby('artist_names').count()
songs_per_artist
track_name | streams | uri | |
---|---|---|---|
artist_names | |||
21 Savage, Metro Boomin | 1 | 1 | 1 |
80purppp | 1 | 1 | 1 |
A Boogie Wit da Hoodie | 1 | 1 | 1 |
... | ... | ... | ... |
Zach Bryan | 4 | 4 | 4 |
d4vd | 2 | 2 | 2 |
Ñengo Flow, Bad Bunny | 1 | 1 | 1 |
145 rows × 3 columns
Using .sort_values
and .take
, we'll keep just the top 15 artists. Note that all columns in songs_per_artist
contain the same information (this is a consequence of using .count()
).
top_15_artists = (songs_per_artist
.sort_values('streams', ascending=False)
.take(np.arange(15)))
top_15_artists
track_name | streams | uri | |
---|---|---|---|
artist_names | |||
SZA | 11 | 11 | 11 |
Taylor Swift | 8 | 8 | 8 |
Morgan Wallen | 6 | 6 | 6 |
... | ... | ... | ... |
Kanye West | 2 | 2 | 2 |
Childish Gambino | 2 | 2 | 2 |
NewJeans | 2 | 2 | 2 |
15 rows × 3 columns
Using .assign
and .drop
, we'll create a column named 'count'
that contains the same information that the other 3 columns contain, and then .get
only that column (or equivalently, drop the other 3 columns).
# If we give .get a list, it will return a DataFrame instead of a Series!
top_15_artists = (top_15_artists
.assign(count=top_15_artists.get('streams'))
.get(['count']))
top_15_artists
count | |
---|---|
artist_names | |
SZA | 11 |
Taylor Swift | 8 |
Morgan Wallen | 6 |
... | ... |
Kanye West | 2 |
Childish Gambino | 2 |
NewJeans | 2 |
15 rows × 1 columns
Before calling .plot(kind='barh', y='count')
, we'll sort top_15_artists
by 'count'
in increasing order. This is because, weirdly, Python reverses the order of rows when creating bars in horizontal bar charts.
top_15_artists.sort_values(by='count').plot(kind='barh', y='count');
To create a vertical bar chart, use kind='bar'
instead of kind='barh'
. These are typically harder to read, though.
top_15_artists.plot(kind='bar', y='count');
(charts
[charts.get('artist_names') == 'The Weeknd']
.sort_values('streams')
.plot(kind='barh', x='track_name', y='streams')
);
It seems like we're missing some popular songs...
Answer: Using .str.contains
.
weeknd = charts[charts.get('artist_names').str.contains('The Weeknd')]
weeknd
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
3 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 1337320 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
13 | Die For You | The Weeknd | 794924 | spotify:track:2LBqCSwhJGcFQeTHMVGwy3 |
76 | Stargirl Interlude | The Weeknd, Lana Del Rey | 372624 | spotify:track:5gDWsRxpJ2lZAffh5p7K0w |
... | ... | ... | ... | ... |
110 | I Was Never There | The Weeknd, Gesaffelstein | 328724 | spotify:track:1cKHdTo9u0ZymJdPGSh6nq |
128 | Blinding Lights | The Weeknd | 311176 | spotify:track:0VjIjW4GlUZAMYd2vXMi3b |
168 | Call Out My Name | The Weeknd | 281141 | spotify:track:09mEdoA6zrmBPgTEN5qXmN |
8 rows × 4 columns
weeknd.sort_values('streams').plot(kind='barh', x='track_name', y='streams');
# Run this cell, don't worry about what it does.
def show_spotify(uri):
code = uri[uri.rfind(':')+1:]
src = f"https://open.spotify.com/embed/track/{code}"
width = 400
height = 75
display(IFrame(src, width, height))
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Flowers | Miley Cyrus | 3356361 | spotify:track:0yLdNVWF3Srea0uzk55zFn |
2 | Kill Bill | SZA | 2479445 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
3 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 1337320 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
... | ... | ... | ... | ... |
198 | Major Distribution | Drake, 21 Savage | 266986 | spotify:track:46s57QULU02Voy0Kup6UEb |
199 | Sun to Me | Zach Bryan | 266968 | spotify:track:1SjsVdSXpwm1kTdYEHoPIT |
200 | The Real Slim Shady | Eminem | 266698 | spotify:track:3yfqSUWxFvZELEM4PmlwIR |
200 rows × 4 columns
favorite_song = 'Bejeweled'
song_uri = (charts
[charts.get('track_name') == favorite_song]
.get('uri')
.iloc[0])
song_uri
'spotify:track:3qoftcUZaUOncvIYjFSPdE'
Watch what happens! 🎶
show_spotify(song_uri)
Try it out yourself!