# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Figures: render inline plots as SVG, use the ggplot look, and make every
# figure 10 x 5 inches by default.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (10, 5)})

# Text output: keep printed arrays and displayed tables short, with two digits
# of precision.
np.set_printoptions(precision=2, suppress=True, threshold=20)
pd.options.display.max_rows = 7
pd.options.display.max_columns = 8
pd.options.display.precision = 2

from IPython.display import HTML, display, IFrame


# Read the whole text and split it on the 'CHAPTER ' marker. The piece before
# the first marker is dropped, leaving one string per chapter.
# The context manager closes the file handle as soon as the text is read.
with open('data/lw.txt') as lw_file:
    chapters = lw_file.read().split('CHAPTER ')[1:]


# Counts of names in the chapters of Little Women.
# np.char.count tallies substring occurrences per chapter, so each column has
# one entry per element of `chapters`.

counts = bpd.DataFrame().assign(
    Amy=np.char.count(chapters, 'Amy'),
    Beth=np.char.count(chapters, 'Beth'),
    Jo=np.char.count(chapters, 'Jo'),
    Meg=np.char.count(chapters, 'Meg'),
    Laurie=np.char.count(chapters, 'Laurie'),
)

# Cumulative number of times each name appears, chapter by chapter.

lw_counts = bpd.DataFrame().assign(
    Amy=np.cumsum(counts.get('Amy')),
    Beth=np.cumsum(counts.get('Beth')),
    Jo=np.cumsum(counts.get('Jo')),
    Meg=np.cumsum(counts.get('Meg')),
    Laurie=np.cumsum(counts.get('Laurie')),
    # Number the chapters 1..N from the chapters actually parsed, instead of
    # hard-coding the count; the column length then always matches the others.
    Chapter=np.arange(1, len(chapters) + 1),
)

lw_counts


# One line per name column, with 'Chapter' on the horizontal axis.
lw_counts.plot(x='Chapter');


# Load the actors data and use the 'Actor' column as the index.
actors = bpd.read_csv('data/actors.csv').set_index('Actor')
actors


# Scatter plot of 'Total Gross' against 'Number of Movies', one point per row.
# (The trailing semicolon keeps the notebook from echoing the plot object.)
actors.plot(kind='scatter', x='Number of Movies', y='Total Gross');


# Same horizontal axis, with 'Average per Movie' on the vertical axis instead.
actors.plot(kind='scatter', x='Number of Movies', y='Average per Movie');


# Rows whose 'Number of Movies' is 60 or more.
actors[actors.get('Number of Movies') >= 60]


# Rows whose 'Number of Movies' is below 10.
actors[actors.get('Number of Movies') < 10]


# Load the per-year movie data and use the 'Year' column as the index.
movies_by_year = bpd.read_csv('data/movies_by_year.csv').set_index('Year')
movies_by_year


# Line plot of 'Number of Movies'. No x column is passed, so the index
# ('Year') supplies the horizontal axis.
movies_by_year.plot(kind='line', y='Number of Movies');


# Identical call to the one above.
# NOTE(review): presumably repeated because the plot is shown on two
# consecutive slides -- confirm before removing.
movies_by_year.plot(kind='line', y='Number of Movies');


# Keep only the rows whose index (year) is 2000 or later, then plot as before.
movies_by_year[movies_by_year.index >= 2000].plot(kind='line', y='Number of Movies');


# Same year filter, plotting 'Total Gross in Billions' instead.
movies_by_year[movies_by_year.index >= 2000].plot(kind='line', y='Total Gross in Billions');

# NOTE(review): the bare `...` below looks like a placeholder cell left for an
# in-lecture exercise, and `Ellipsis` like its echoed value from the export.
...

Ellipsis


# Load the chart data, index it by 'rank', and keep only the four columns
# used in the rest of the lecture.
charts = (bpd.read_csv('data/regional-us-daily-2023-01-21.csv')
          .set_index('rank')
          .get(['track_name', 'artist_names', 'streams', 'uri'])
         )
charts


charts


# First 10 rows of the table (positions 0 through 9).
charts.take(np.arange(10))


# Horizontal bar chart of 'streams' for those 10 rows, labelled by track name.
charts.take(np.arange(10)).plot(kind='barh', x='track_name', y='streams');


# The bars appear in the opposite order relative to the DataFrame (the first
# row is drawn at the bottom), so sort by 'streams' in ascending order before
# plotting to put the longest bar on top.
(charts
 .take(np.arange(10))
 .sort_values(by='streams')
 .plot(kind='barh', x='track_name', y='streams')
);


charts


# Group the chart rows by their 'artist_names' string and count the rows in
# each group. Every remaining column then holds a per-group count (the same
# number in each column, assuming no missing values).
songs_per_artist = charts.groupby('artist_names').count()
songs_per_artist


# Order the groups from most to fewest rows and keep the first 15.
# 'streams' is used here only as a column that carries the count.
top_15_artists = (songs_per_artist
                  .sort_values('streams', ascending=False)
                  .take(np.arange(15)))
top_15_artists


# Copy the count into a column named 'count' and keep only that column.
# If we give .get a list, it will return a DataFrame instead of a Series!
top_15_artists = (top_15_artists
                  .assign(count=top_15_artists.get('streams'))
                  .get(['count']))
top_15_artists


# Horizontal bars, sorted ascending by 'count' before plotting.
top_15_artists.sort_values(by='count').plot(kind='barh', y='count');


# Vertical bars, in the table's existing (descending) order.
top_15_artists.plot(kind='bar', y='count');


# Exact match: keeps only rows whose 'artist_names' is exactly 'The Weeknd',
# so rows listing several artists in one string are left out.
(charts
 [charts.get('artist_names') == 'The Weeknd']
 .sort_values('streams')
 .plot(kind='barh', x='track_name', y='streams')
);


# Substring match: keeps every row whose 'artist_names' contains
# 'The Weeknd', which also picks up multi-artist rows.
weeknd = charts[charts.get('artist_names').str.contains('The Weeknd')]
weeknd


# Same chart as above, now drawn from the wider selection.
weeknd.sort_values('streams').plot(kind='barh', x='track_name', y='streams');


# Run this cell, don't worry about what it does.
def show_spotify(uri, width=400, height=75):
    """Display an embedded Spotify player for a track.

    Parameters
    ----------
    uri : str
        Track URI such as 'spotify:track:<id>'. Everything after the last
        ':' is used as the track id; a string without ':' is used whole.
    width : int, optional
        Width passed to the IFrame (default 400).
    height : int, optional
        Height passed to the IFrame (default 75).

    Returns
    -------
    None
        The player is shown through IPython's display machinery.
    """
    # rpartition yields (head, sep, tail); the tail is the id after the final ':'.
    track_id = uri.rpartition(':')[2]
    src = f"https://open.spotify.com/embed/track/{track_id}"
    display(IFrame(src, width, height))


charts


# Track name to look up; it is compared for exact equality with 'track_name'.
favorite_song = 'Bejeweled'


# Keep the rows whose 'track_name' equals favorite_song, then take the 'uri'
# of the first such row. .iloc[0] raises an IndexError if no row matches.
song_uri = (charts
            [charts.get('track_name') == favorite_song]
            .get('uri')
            .iloc[0])
song_uri

# NOTE(review): the string below looks like the echoed value of song_uri
# captured from the notebook output rather than an input cell.
'spotify:track:3qoftcUZaUOncvIYjFSPdE'


# Show the embedded player for the URI found above.
show_spotify(song_uri)

Action	Keyboard shortcut
Run cell + jump to next cell	SHIFT + ENTER
Save the notebook	CTRL/CMD + S
Create new cell above/below	A/B
Delete cell	DD

	Amy	Beth	Jo	Meg	Laurie	Chapter
0	23	26	44	26	0	1
1	36	38	65	46	0	2
2	38	40	127	82	16	3
...	...	...	...	...	...	...
44	633	461	1450	675	581	45
45	635	462	1506	679	583	46
46	645	465	1543	685	596	47

	Total Gross	Number of Movies	Average per Movie	#1 Movie	Gross
Actor
Harrison Ford	4871.7	41	118.8	Star Wars: The Force Awakens	936.7
Samuel L. Jackson	4772.8	69	69.2	The Avengers	623.4
Morgan Freeman	4468.3	61	73.3	The Dark Knight	534.9
...	...	...	...	...	...
Sandra Bullock	2462.6	35	70.4	Minions	336.0
Chris Evans	2457.8	23	106.9	The Avengers	623.4
Anne Hathaway	2416.5	25	96.7	The Dark Knight Rises	448.1

	Total Gross	Number of Movies	Average per Movie	#1 Movie	Gross
Actor
Samuel L. Jackson	4772.8	69	69.2	The Avengers	623.4
Morgan Freeman	4468.3	61	73.3	The Dark Knight	534.9
Bruce Willis	3189.4	60	53.2	Sixth Sense	293.5
Robert DeNiro	3081.3	79	39.0	Meet the Fockers	279.3
Liam Neeson	2942.7	63	46.7	The Phantom Menace	474.5

	Total Gross	Number of Movies	Average per Movie	#1 Movie	Gross
Actor
Anthony Daniels	3162.9	7	451.8	Star Wars: The Force Awakens	936.7

	Total Gross in Billions	Number of Movies	#1 Movie
Year
2022	5.64	380	Top Gun: Maverick
2021	4.48	439	Spider-Man: No Way Home
2020	2.11	456	Bad Boys for Life
...	...	...	...
1979	1.23	40	Superman
1978	0.83	13	Grease
1977	0.44	9	Star Wars: Episode IV - A New Hope

	track_name	artist_names	streams	uri
rank
1	Flowers	Miley Cyrus	3356361	spotify:track:0yLdNVWF3Srea0uzk55zFn
2	Kill Bill	SZA	2479445	spotify:track:1Qrg8KqiBpW07V7PNxwwwL
3	Creepin' (with The Weeknd & 21 Savage)	Metro Boomin, The Weeknd, 21 Savage	1337320	spotify:track:2dHHgzDwk4BJdRwy9uXhTO
...	...	...	...	...
198	Major Distribution	Drake, 21 Savage	266986	spotify:track:46s57QULU02Voy0Kup6UEb
199	Sun to Me	Zach Bryan	266968	spotify:track:1SjsVdSXpwm1kTdYEHoPIT
200	The Real Slim Shady	Eminem	266698	spotify:track:3yfqSUWxFvZELEM4PmlwIR

	track_name	streams	uri
artist_names
21 Savage, Metro Boomin	1	1	1
80purppp	1	1	1
A Boogie Wit da Hoodie	1	1	1
...	...	...	...
Zach Bryan	4	4	4
d4vd	2	2	2
Ñengo Flow, Bad Bunny	1	1	1

	track_name	streams	uri
artist_names
SZA	11	11	11
Taylor Swift	8	8	8
Morgan Wallen	6	6	6
...	...	...	...
Kanye West	2	2	2
Childish Gambino	2	2	2
NewJeans	2	2	2

Lecture 6 – Data Visualization 📈

DSC 10, Winter 2023

Announcements

Aside: keyboard shortcuts

Agenda

Why visualize?

Little Women

Napoleon's March

Why visualize?

Terminology

Individuals and variables

Types of variables

Examples of numerical variables

Examples of categorical variables

Concept Check ✅ – Answer at cc.dsc10.com

Types of visualizations

Scatter plots

Dataset of 50 top-grossing actors

Scatter plots

Scatter plots

Scatter plots

Who was in 60 or more movies?

Who is the outlier?

Anthony Daniels

Line plots 📉

Dataset aggregating movies by year

Line plots

Line plots

Plotting tip

Since the year 2000

How did this affect total gross?

What was the top grossing movie of 2016? 🐟

Bar charts 📊

Dataset of the top 200 songs in the US on Spotify as of Saturday (1/21/23)

Bar charts

Bar charts

How many songs do the top 15 artists have in the top 200?

Vertical bar charts

Aside: How many streams did The Weeknd's songs on the chart receive?

How do we include featured songs, as well?

Fun demo 🎵

Let's find the URI of a song we care about.

Bad visualizations

Summary

Summary

Concept Check ✅ – Answer at cc.dsc10.com