# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Render plots as crisp SVG rather than rasterized images.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10, 5)
# Keep printed arrays and DataFrames short and readable.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# display/IFrame/YouTubeVideo are used later to embed slides, songs, and videos.
from IPython.display import display, IFrame, YouTubeVideo
def show_grouping_animation():
    """Display the groupby animation slides in an embedded IFrame.

    Restored the function body's indentation, which was flattened
    (the un-indented body is a SyntaxError as written).
    """
    src = "https://docs.google.com/presentation/d/e/2PACX-1vTgVlFngQcLMYHP-z1vq5lVXjsBgcHebc-3TX7SW6L_gjX6TD1gsflvVDQUpWiDdeEPqJASenUIfBVd/embed?start=false&loop=false&delayms=60000"
    width = 960
    height = 509
    display(IFrame(src, width, height))
import warnings
# Hide all warning messages in the notebook output.
warnings.simplefilter('ignore')
There are several keyboard shortcuts built into Jupyter Notebooks designed to help you save time. To see them, either click the keyboard button in the toolbar above or hit the H key on your keyboard (as long as you're not actively editing a cell).
Particularly useful shortcuts:
Action | Keyboard shortcut |
---|---|
Run cell + jump to next cell | SHIFT + ENTER |
Save the notebook | CTRL/CMD + S |
Create new cell above/below | A/B |
Delete cell | DD |
show_grouping_animation()
Run the cell below to load in the requests
DataFrame from last class.
# Load the Get It Done requests data and add a 'total' column:
# the number of closed plus open requests for each row.
requests = bpd.read_csv('data/get-it-done-requests.csv')
closed_plus_open = requests.get('closed') + requests.get('open')
requests = requests.assign(total=closed_plus_open)
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
# Total closed, open, and overall request counts for each neighborhood.
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 1173 | 261 | 1434 |
Barrio Logan | 1185 | 201 | 1386 |
Black Mountain Ranch | 151 | 44 | 195 |
... | ... | ... | ... |
University | 1614 | 620 | 2234 |
Uptown | 5654 | 1691 | 7345 |
Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
# Note the use of .index – remember, the index isn't a column!
neighborhood_totals = requests.groupby('neighborhood').sum()
neighborhood_totals.sort_values(by='total', ascending=False).index[0]
'Downtown'
How do we find the number of different services requested in each neighborhood?
As always when using groupby, there are two steps:

1. Choose a column to group by: 'neighborhood' seems like a good choice.
2. Choose an aggregation method: .count(), .sum(), .mean(), .median(), .max(), or .min().

# How many different requests are there for the neighborhood 'University'?
# Keep only the rows for the 'University' neighborhood.
requests[requests.get('neighborhood') == 'University']
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
1354 | University | Dead Animal | 25 | 0 | 25 |
1355 | University | Development Services - Code Enforcement | 7 | 2 | 9 |
1356 | University | Encampment | 55 | 27 | 82 |
... | ... | ... | ... | ... | ... |
1381 | University | Tree Maintenance | 47 | 8 | 55 |
1382 | University | Waste on Private Property | 5 | 1 | 6 |
1383 | University | Weed Cleanup | 1 | 4 | 5 |
30 rows × 5 columns
# How do we find this result for every neighborhood?
The column names of the output of .groupby don't make sense when using the .count() aggregation method.
# .count() puts the number of rows per group in every column,
# so all columns of the result contain the same values.
num_diff_services = requests.groupby('neighborhood').count()
num_diff_services
service | closed | open | total | |
---|---|---|---|---|
neighborhood | ||||
Balboa Park | 28 | 28 | 28 | 28 |
Barrio Logan | 28 | 28 | 28 | 28 |
Black Mountain Ranch | 24 | 24 | 24 | 24 |
... | ... | ... | ... | ... |
University | 30 | 30 | 30 | 30 |
Uptown | 31 | 31 | 31 | 31 |
Via De La Valle | 6 | 6 | 6 | 6 |
57 rows × 4 columns
Consider dropping unneeded columns and renaming columns as follows:

- Use .assign to create a new column containing the same values as the old column(s).
- Use .drop(columns=list_of_column_labels) to drop the old column(s).
- Alternatively, use .get(list_of_column_labels) to keep only the columns in the given list. The columns will appear in the order you specify, so this is also useful for reordering columns!

num_diff_services = num_diff_services.assign(
count_of_services=num_diff_services.get('open')
).drop(columns=['service', 'closed', 'open', 'total'])
num_diff_services
count_of_services | |
---|---|
neighborhood | |
Balboa Park | 28 |
Barrio Logan | 28 |
Black Mountain Ranch | 24 |
... | ... |
University | 30 |
Uptown | 31 |
Via De La Valle | 6 |
57 rows × 1 columns
Run these cells to load the Little Women data from Lecture 1.
# Read the text of Little Women and split it into one string per chapter.
# Using a context manager closes the file handle instead of leaking it.
with open('data/lw.txt') as f:
    chapters = f.read().split('CHAPTER ')[1:]

# Counts of names in the chapters of Little Women.
counts = bpd.DataFrame().assign(
    Amy=np.char.count(chapters, 'Amy'),
    Beth=np.char.count(chapters, 'Beth'),
    Jo=np.char.count(chapters, 'Jo'),
    Meg=np.char.count(chapters, 'Meg'),
    Laurie=np.char.count(chapters, 'Laurie'),
)

# Cumulative number of times each name appears.
# Chapter numbers are derived from the actual number of chapters
# (47 for this text) rather than hard-coding the endpoint.
lw_counts = bpd.DataFrame().assign(
    Amy=np.cumsum(counts.get('Amy')),
    Beth=np.cumsum(counts.get('Beth')),
    Jo=np.cumsum(counts.get('Jo')),
    Meg=np.cumsum(counts.get('Meg')),
    Laurie=np.cumsum(counts.get('Laurie')),
    Chapter=np.arange(1, len(chapters) + 1)
)
lw_counts
Amy | Beth | Jo | Meg | Laurie | Chapter | |
---|---|---|---|---|---|---|
0 | 23 | 26 | 44 | 26 | 0 | 1 |
1 | 36 | 38 | 65 | 46 | 0 | 2 |
2 | 38 | 40 | 127 | 82 | 16 | 3 |
... | ... | ... | ... | ... | ... | ... |
44 | 633 | 461 | 1450 | 675 | 581 | 45 |
45 | 635 | 462 | 1506 | 679 | 583 | 46 |
46 | 645 | 465 | 1543 | 685 | 596 | 47 |
47 rows × 6 columns
In Lecture 1, we were able to answer questions about the plot of Little Women without having to read the novel and without having to understand Python code. Some of those questions included:
We answered these questions from a data visualization alone!
lw_counts.plot(x='Chapter');
There are two main types of variables:
Note that here, "variable" does not mean a variable in Python, but rather it means a column in a DataFrame.
Which of these is not a numerical variable?
A. Fuel economy in miles per gallon.
B. Number of quarters at UCSD.
C. College at UCSD (Sixth, Seventh, etc).
D. Bank account number.
E. More than one of these are not numerical variables.
The type of visualization we create depends on the kinds of variables we're visualizing.
We may interchange the words "plot", "chart", and "graph"; they all mean the same thing.
Column | Contents |
---|---|
'Actor'
|Name of actor
'Total Gross'
| Total gross domestic box office receipt, in millions of dollars, of all of the actor’s movies
'Number of Movies'
| The number of movies the actor has been in
'Average per Movie'
| Total gross divided by number of movies
'#1 Movie'
| The highest grossing movie the actor has been in
'Gross'
| Gross domestic box office receipt, in millions of dollars, of the actor’s #1 Movie
# Box-office data for 50 actors, indexed by actor name.
actors = bpd.read_csv('data/actors.csv').set_index('Actor')
actors
Total Gross | Number of Movies | Average per Movie | #1 Movie | Gross | |
---|---|---|---|---|---|
Actor | |||||
Harrison Ford | 4871.7 | 41 | 118.8 | Star Wars: The Force Awakens | 936.7 |
Samuel L. Jackson | 4772.8 | 69 | 69.2 | The Avengers | 623.4 |
Morgan Freeman | 4468.3 | 61 | 73.3 | The Dark Knight | 534.9 |
... | ... | ... | ... | ... | ... |
Sandra Bullock | 2462.6 | 35 | 70.4 | Minions | 336.0 |
Chris Evans | 2457.8 | 23 | 106.9 | The Avengers | 623.4 |
Anne Hathaway | 2416.5 | 25 | 96.7 | The Dark Knight Rises | 448.1 |
50 rows × 5 columns
What is the relationship between 'Number of Movies'
and 'Total Gross'
?
actors.plot(kind='scatter', x='Number of Movies', y='Total Gross');
To create a scatter plot from a DataFrame df, use:

df.plot(
    kind='scatter',
    x=x_column_for_horizontal,
    y=y_column_for_vertical
)

Both columns must come from df. If you put a semicolon after a call to .plot, it will hide the weird text output that displays.

What is the relationship between 'Number of Movies' and 'Average per Movie'?
actors.plot(kind='scatter', x='Number of Movies', y='Average per Movie');
Note that in the above plot, there's a negative association and an outlier.
actors[actors.get('Number of Movies') >= 60]
Total Gross | Number of Movies | Average per Movie | #1 Movie | Gross | |
---|---|---|---|---|---|
Actor | |||||
Samuel L. Jackson | 4772.8 | 69 | 69.2 | The Avengers | 623.4 |
Morgan Freeman | 4468.3 | 61 | 73.3 | The Dark Knight | 534.9 |
Bruce Willis | 3189.4 | 60 | 53.2 | Sixth Sense | 293.5 |
Robert DeNiro | 3081.3 | 79 | 39.0 | Meet the Fockers | 279.3 |
Liam Neeson | 2942.7 | 63 | 46.7 | The Phantom Menace | 474.5 |
Whoever they are, they made very few, high grossing movies.
actors[actors.get('Number of Movies') < 10]
Total Gross | Number of Movies | Average per Movie | #1 Movie | Gross | |
---|---|---|---|---|---|
Actor | |||||
Anthony Daniels | 3162.9 | 7 | 451.8 | Star Wars: The Force Awakens | 936.7 |
Column | Content |
---|---|
'Year'
| Year
'Total Gross in Billions'
| Total domestic box office gross, in billions of dollars, of all movies released
'Number of Movies'
| Number of movies released
'#1 Movie'
| Highest grossing movie
# Yearly box-office totals, indexed by year.
movies_by_year = bpd.read_csv('data/movies_by_year.csv').set_index('Year')
movies_by_year
Total Gross in Billions | Number of Movies | #1 Movie | |
---|---|---|---|
Year | |||
2022 | 5.64 | 380 | Top Gun: Maverick |
2021 | 4.48 | 439 | Spider-Man: No Way Home |
2020 | 2.11 | 456 | Bad Boys for Life |
... | ... | ... | ... |
1979 | 1.23 | 40 | Superman |
1978 | 0.83 | 13 | Grease |
1977 | 0.44 | 9 | Star Wars: Episode IV - A New Hope |
46 rows × 3 columns
How has the number of movies changed over time? 🤔
movies_by_year.plot(kind='line', y='Number of Movies');
To create a line plot from a DataFrame df, use:

df.plot(
    kind='line',
    x=x_column_for_horizontal,
    y=y_column_for_vertical
)

To use the index as the x-axis, omit the x= argument!

movies_by_year.plot(kind='line', y='Number of Movies');
We can create a line plot of just 2000 onwards by querying movies_by_year
before calling .plot
.
movies_by_year[movies_by_year.index >= 2000].plot(kind='line', y='Number of Movies');
What do you think explains the declines around 2008 and 2020?
movies_by_year[movies_by_year.index >= 2000].plot(kind='line', y='Total Gross in Billions');
...
Ellipsis
If you're curious how line plots work under the hood, watch this video we made a few quarters ago.
YouTubeVideo('glzZ04D1kDg')
# Spotify US daily top-200 chart for April 13, 2023, indexed by chart rank.
charts = bpd.read_csv('data/regional-us-daily-2023-04-13.csv')
charts = charts.set_index('rank')
charts = charts.get(['track_name', 'artist_names', 'streams', 'uri'])
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
... | ... | ... | ... | ... |
198 | Redbone | Childish Gambino | 291222 | spotify:track:0wXuerDYiBnERgIpbb3JBR |
199 | You're On Your Own, Kid | Taylor Swift | 290995 | spotify:track:4D7BCuvgdJlYvlX5WlN54t |
200 | Fall In Love | Bailey Zimmerman | 290535 | spotify:track:5gVCfYmQRPy1QJifP8f5gg |
200 rows × 4 columns
How many streams do the top 10 songs have?
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
... | ... | ... | ... | ... |
198 | Redbone | Childish Gambino | 291222 | spotify:track:0wXuerDYiBnERgIpbb3JBR |
199 | You're On Your Own, Kid | Taylor Swift | 290995 | spotify:track:4D7BCuvgdJlYvlX5WlN54t |
200 | Fall In Love | Bailey Zimmerman | 290535 | spotify:track:5gVCfYmQRPy1QJifP8f5gg |
200 rows × 4 columns
charts.take(np.arange(10))
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
... | ... | ... | ... | ... |
8 | La Bebe - Remix | Yng Lvcas, Peso Pluma | 906832 | spotify:track:2UW7JaomAMuX9pZrjVpHAU |
9 | You Proof | Morgan Wallen | 833741 | spotify:track:5W4kiM2cUYBJXKRudNyxjW |
10 | Cupid - Twin Ver. | FIFTY FIFTY | 787840 | spotify:track:7FbrGaHYVDmfr7KoLIZnQ7 |
10 rows × 4 columns
charts.take(np.arange(10)).plot(kind='barh', x='track_name', y='streams');
To create a horizontal bar chart from a DataFrame df, use:

df.plot(
    kind='barh',
    x=categorical_column_name,
    y=numerical_column_name
)

The "h" in 'barh' stands for "horizontal". Note that we specify y='streams' even though streams are measured by x-axis length.

# The bars appear in the opposite order relative to the DataFrame.
# Sorting ascending by streams puts the most-streamed song at the top of the chart.
top_ten_by_streams = charts.take(np.arange(10)).sort_values(by='streams')
top_ten_by_streams.plot(kind='barh', x='track_name', y='streams');
# Change "barh" to "bar" to get a vertical bar chart. These are a little harder to read.
top_ten_sorted = charts.take(np.arange(10)).sort_values(by='streams')
top_ten_sorted.plot(kind='bar', x='track_name', y='streams');
# Exact string match — songs credited to multiple artists are missed here.
solo_weeknd = charts[charts.get('artist_names') == 'The Weeknd']
solo_weeknd.sort_values('streams').plot(kind='barh', x='track_name', y='streams');
It seems like we're missing some popular songs...
Answer: Using .str.contains.
# .str.contains matches substrings, so collaborations that credit
# several artists are included too.
weeknd = charts[charts.get('artist_names').str.contains('The Weeknd')]
weeknd
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
12 | Creepin' (with The Weeknd & 21 Savage) | Metro Boomin, The Weeknd, 21 Savage | 783095 | spotify:track:2dHHgzDwk4BJdRwy9uXhTO |
26 | Die For You | The Weeknd | 658995 | spotify:track:2LBqCSwhJGcFQeTHMVGwy3 |
41 | Die For You (with Ariana Grande) - Remix | The Weeknd, Ariana Grande | 572829 | spotify:track:4W4fNrZYkobj539TOWsLO2 |
97 | Starboy | The Weeknd, Daft Punk | 387522 | spotify:track:7MXVkk9YMctZqd1Srtv4MB |
110 | Stargirl Interlude | The Weeknd, Lana Del Rey | 373574 | spotify:track:5gDWsRxpJ2lZAffh5p7K0w |
171 | The Hills | The Weeknd | 312251 | spotify:track:7fBv7CLKzipRk6EC6TWHOB |
180 | Blinding Lights | The Weeknd | 305384 | spotify:track:0VjIjW4GlUZAMYd2vXMi3b |
weeknd.sort_values('streams').plot(kind='barh', x='track_name', y='streams');
# Run this cell, don't worry about what it does.
def show_spotify(uri):
    """Embed the Spotify player for the track identified by *uri*.

    The track code is everything after the last ':' in the URI,
    e.g. 'spotify:track:abc123' -> 'abc123'. Restored the function
    body's indentation, which was flattened (a SyntaxError as written).
    """
    code = uri[uri.rfind(':')+1:]
    src = f"https://open.spotify.com/embed/track/{code}"
    width = 400
    height = 75
    display(IFrame(src, width, height))
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
... | ... | ... | ... | ... |
198 | Redbone | Childish Gambino | 291222 | spotify:track:0wXuerDYiBnERgIpbb3JBR |
199 | You're On Your Own, Kid | Taylor Swift | 290995 | spotify:track:4D7BCuvgdJlYvlX5WlN54t |
200 | Fall In Love | Bailey Zimmerman | 290535 | spotify:track:5gVCfYmQRPy1QJifP8f5gg |
200 rows × 4 columns
favorite_song = 'Die For You (with Ariana Grande) - Remix'
# Find the row for the favorite song and pull out its Spotify URI.
# .iloc[0] extracts the string from the resulting one-row Series.
song_uri = (charts
[charts.get('track_name') == favorite_song]
.get('uri')
.iloc[0])
song_uri
'spotify:track:4W4fNrZYkobj539TOWsLO2'
Watch what happens! 🎶
show_spotify(song_uri)
Try it out yourself!