# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd

%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}

# Plotting setup for the notebook.
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Ask the inline backend to render figures as SVG.
set_matplotlib_formats("svg")
# Apply matplotlib's 'ggplot' style sheet to the plots that follow.
plt.style.use('ggplot')


# Load the Get It Done service requests CSV into a DataFrame.
requests = bpd.read_csv('data/get-it-done-requests.csv')
# Add a 'total' column: the element-wise sum of the 'closed' and 'open' columns.
requests = requests.assign(total=requests.get('closed') + requests.get('open'))
# A bare expression on the last line of a cell displays the DataFrame.
requests


# Step 1 (query): keep only the rows whose 'service' is exactly 'Tree Maintenance'.
tree_maintenance_only = requests[requests.get('service') == 'Tree Maintenance']
tree_maintenance_only


# Step 2 (sort): order those rows by 'total', largest first.
tree_maintenance_sorted = tree_maintenance_only.sort_values(by='total', ascending=False)
tree_maintenance_sorted


# Step 3 (extract): after the descending sort, position 0 holds the row with the
# largest 'total', so this is the neighborhood with the most such requests.
tree_maintenance_sorted.get('neighborhood').iloc[0]

'Southeastern San Diego'


# Same query pattern with a different service name.
# NOTE(review): presumably no row has this service, so the result is an empty
# DataFrame rather than an error -- confirm against the data.
requests[requests.get('service') == 'Car Maintenance']

...

Ellipsis

...

Ellipsis


# Query with two conditions joined by | ("or"): keep rows whose 'service' is
# 'Pothole' or 'Pavement Maintenance'. Each comparison needs its own parentheses
# because | binds more tightly than == in Python.
requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')]


# You can add line breaks within brackets or parentheses
# (this is the same query as above, split across two lines for readability).
requests[(requests.get('service') == 'Pothole') | 
         (requests.get('service') == 'Pavement Maintenance')]


# First condition on its own: an element-wise comparison that yields a Boolean
# Series, True in the rows where 'service' is 'Pothole'.
(requests.get('service') == 'Pothole')

0       False
1       False
2       False
3       False
4       False
        ...  
1057    False
1058    False
1059    False
1060    False
1061     True
Name: service, Length: 1062, dtype: bool


# Second condition on its own: a Boolean Series, True in the rows where
# 'service' is 'Pavement Maintenance'.
(requests.get('service') == 'Pavement Maintenance')

0       False
1       False
2       False
3       False
4       False
        ...  
1057    False
1058    False
1059    False
1060    False
1061    False
Name: service, Length: 1062, dtype: bool


# | combines the two Boolean Series element-wise: a row is True when at least
# one of the two comparisons is True for that row.
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')

0       False
1       False
2       False
3       False
4       False
        ...  
1057    False
1058    False
1059    False
1060    False
1061     True
Name: service, Length: 1062, dtype: bool


# Answer the original question: filter to the 'Pothole' / 'Pavement Maintenance'
# rows, pull out their 'total' column, and add the values up.
(
    requests[
        (requests.get('service') == 'Pothole')
        | (requests.get('service') == 'Pavement Maintenance')
    ]
    .get('total')
    .sum()
)

2805

...

Ellipsis


# .take selects rows by integer position: here the rows at positions 1, 3, and 5.
requests.take([1, 3, 5])


# np.arange(5) is the positions 0 through 4, so this is the first five rows.
requests.take(np.arange(5))


# Total requests for a single neighborhood: filter to 'Carmel Valley', then sum
# that neighborhood's 'total' column.
requests[requests.get('neighborhood') == 'Carmel Valley'].get('total').sum()

144


# Same computation for 'Uptown'. Repeating this by hand for every neighborhood
# is the tedium that .groupby removes.
requests[requests.get('neighborhood') == 'Uptown'].get('total').sum()

1128


# All neighborhoods at once: one row per distinct 'neighborhood', with the
# values summed within each group. 'neighborhood' becomes the index of the result.
requests.groupby('neighborhood').sum()


# A small hand-built DataFrame for illustrating groupby: start from an empty
# DataFrame and add one column per keyword argument.
pets = bpd.DataFrame().assign(
    Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
    Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
    Weight=[40, 15, 20, 80, 25, 1],
    Age=[5, 8, 9, 2, 0.5, 3]
)
pets


%%pt

# Split the rows by 'Species', then take the mean within each group.
pets.groupby('Species').mean()


# Without Pandas Tutor
# (same expression as the cell above, run as an ordinary cell)
pets.groupby('Species').mean()


%%pt

# With Pandas Tutor
# (same expression again, this time inside a %%pt cell)
pets.groupby('Species').mean()


# The original table and its grouped sum are displayed as a pair three times
# below, so the two can be compared side by side.
requests


# One row per neighborhood; values are summed within each group.
requests.groupby('neighborhood').sum()


requests


requests.groupby('neighborhood').sum()


requests


requests.groupby('neighborhood').sum()


# Can you guess how the max is determined?
# NOTE(review): presumably the max is taken separately for each column within a
# group, so a result row need not match any single original row -- confirm.
requests.groupby('neighborhood').max()


requests.groupby('neighborhood').max()


# Answering the question for one particular neighborhood, La Jolla
# (look at that neighborhood's original rows to check the grouped max by hand).
requests[requests.get('neighborhood') == 'La Jolla']

...

Ellipsis


# Group by neighborhood and count the entries in each remaining column.
num_diff_services = requests.groupby('neighborhood').count()
num_diff_services


# Tidy up: copy the 'open' counts into a descriptively named column, then drop
# the four original columns so only count_of_services remains.
# NOTE(review): presumably every column holds the same per-group count, which
# is why any one of them ('open' here) can be used -- confirm.
num_diff_services = (
    num_diff_services
    .assign(count_of_services=num_diff_services.get('open'))
    .drop(columns=['service', 'closed', 'open', 'total'])
)
num_diff_services


# Load the IMDb data, use 'Title' as the index, and sort the rows by 'Rating'
# (sort_values is ascending by default, so the lowest ratings come first).
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb


# How many movies appear from each decade: count the rows in each 'Decade' group.
imdb.groupby('Decade').count()


# We'll learn how to make plots like this in the next lecture!
# Plots the per-decade counts, using the 'Year' column for the y-values. The
# trailing semicolon keeps the notebook from echoing the call's return value.
imdb.groupby('Decade').count().plot(y='Year');


%%pt
# Without grouping: keep the rows whose 'Decade' is 1990, sort them by 'Rating'
# from highest to lowest, and read the first index label. The index is 'Title',
# so the result is a movie title.
imdb[imdb.get('Decade') == 1990].sort_values('Rating', ascending=False).index[0]


%%pt
# With grouping: reset_index turns 'Title' back into a regular column so that it
# takes part in the aggregation, then the max is taken within each decade.
# NOTE(review): .max() is presumably computed column by column, so the 'Title'
# shown for a decade may not be the title of that decade's top-rated movie --
# confirm before reading the answer off this table.
imdb.reset_index().groupby('Decade').max()

symbol	meaning
`==`	equal to
`!=`	not equal to
`<`	less than
`<=`	less than or equal to
`>`	greater than
`>=`	greater than or equal to

	neighborhood	service	closed	open	total
1	Balboa Park	Encampment	22	16	38
3	Balboa Park	Graffiti - Code Enforcement	0	1	1
5	Balboa Park	Illegal Dumping	3	4	7

	neighborhood	service	closed	open	total
0	Balboa Park	Dead Animal	4	0	4
1	Balboa Park	Encampment	22	16	38
2	Balboa Park	Environmental Services Code Compliance	0	1	1
3	Balboa Park	Graffiti - Code Enforcement	0	1	1
4	Balboa Park	Graffiti - Public	62	37	99

	closed	open	total
neighborhood
Balboa Park	163	137	300
Barrio Logan	116	91	207
Black Mountain Ranch	27	24	51
Carmel Mountain Ranch	14	83	97
Carmel Valley	83	61	144
...	...	...	...
Torrey Hills	22	21	43
Torrey Pines	39	64	103
University	114	214	328
Uptown	592	536	1128
Via De La Valle	0	2	2

	closed	open	total
neighborhood
Balboa Park	163	137	300
Barrio Logan	116	91	207
Black Mountain Ranch	27	24	51
Carmel Mountain Ranch	14	83	97
Carmel Valley	83	61	144
...	...	...	...
Torrey Hills	22	21	43
Torrey Pines	39	64	103
University	114	214	328
Uptown	592	536	1128
Via De La Valle	0	2	2

	neighborhood	service	closed	open	total
21	Balboa Park	Tree Maintenance	2	7	9
41	Barrio Logan	Tree Maintenance	1	2	3
57	Black Mountain Ranch	Tree Maintenance	2	0	2
95	Carmel Valley	Tree Maintenance	6	2	8
122	Clairemont Mesa	Tree Maintenance	14	15	29
...	...	...	...	...	...
974	Torrey Highlands	Tree Maintenance	0	1	1
985	Torrey Hills	Tree Maintenance	1	1	2
1006	Torrey Pines	Tree Maintenance	1	2	3
1029	University	Tree Maintenance	5	9	14
1057	Uptown	Tree Maintenance	19	29	48

	neighborhood	service	closed	open	total
935	Southeastern San Diego	Tree Maintenance	12	44	56
596	North Park	Tree Maintenance	24	30	54
1057	Uptown	Tree Maintenance	19	29	48
179	Downtown	Tree Maintenance	15	28	43
712	Pacific Beach	Tree Maintenance	22	15	37
...	...	...	...	...	...
497	Mission Bay Park	Tree Maintenance	0	1	1
963	Tijuana River Valley	Tree Maintenance	0	1	1
974	Torrey Highlands	Tree Maintenance	0	1	1
638	Old Town San Diego	Tree Maintenance	1	0	1
518	Mission Beach	Tree Maintenance	1	0	1

	neighborhood	service	closed	open	total
9	Balboa Park	Pavement Maintenance	0	1	1
10	Balboa Park	Pothole	12	9	21
29	Barrio Logan	Pavement Maintenance	2	3	5
30	Barrio Logan	Pothole	1	8	9
65	Carmel Mountain Ranch	Pavement Maintenance	0	1	1
...	...	...	...	...	...
1015	University	Pavement Maintenance	0	1	1
1016	University	Pothole	15	101	116
1043	Uptown	Pavement Maintenance	1	9	10
1044	Uptown	Pothole	15	93	108
1061	Via De La Valle	Pothole	0	2	2

	Species	Color	Weight	Age
0	dog	black	40	5.0
1	cat	golden	15	8.0
2	cat	black	20	9.0
3	dog	white	80	2.0
4	dog	golden	25	0.5
5	hamster	golden	1	3.0

	neighborhood	service	closed	open	total
260	La Jolla	Dead Animal	2	0	2
261	La Jolla	Encampment	13	8	21
262	La Jolla	Environmental Services Code Compliance	1	3	4
263	La Jolla	Graffiti - Public	3	2	5
264	La Jolla	Illegal Dumping	8	7	15
...	...	...	...	...	...
279	La Jolla	Traffic Sign Maintenance	2	24	26
280	La Jolla	Traffic Signal Issue	11	1	12
281	La Jolla	Traffic Signal Timing	3	1	4
282	La Jolla	Trash/Recycling Collection	1	1	2
283	La Jolla	Tree Maintenance	11	14	25

	Votes	Rating	Year	Decade
Title
Akira	91652	8.0	1988	1980
Per un pugno di dollari	124671	8.0	1964	1960
Guardians of the Galaxy	527349	8.0	2014	2010
The Man Who Shot Liberty Valance	49135	8.0	1962	1960
Underground	39447	8.0	1995	1990
...	...	...	...	...
Schindler's List	761224	8.9	1993	1990
12 Angry Men	384187	8.9	1957	1950
The Godfather: Part II	692753	9.0	1974	1970
The Shawshank Redemption	1498733	9.2	1994	1990
The Godfather	1027398	9.2	1972	1970

	Votes	Rating	Year
Decade
1920	4	4	4
1930	7	7	7
1940	14	14	14
1950	30	30	30
1960	22	22	22
1970	21	21	21
1980	31	31	31
1990	42	42	42
2000	50	50	50
2010	29	29	29

Lecture 5 – More Querying and GroupBy¶

DSC 10, Winter 2023¶

Announcements¶

Agenda¶

Resources:¶

About the Data: Get It Done service requests 👷¶

Recap: queries¶

What is a query? 🤔¶

How do we query a DataFrame?¶

Element-wise comparisons¶

Example 5: Which neighborhood has the most 'Tree Maintenance' requests? 🌳¶

Strategy¶

What if the condition isn't satisfied?¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Activity 🚘¶

Example 6: How many service requests were for 'Pothole' or 'Pavement Maintenance'?¶

Multiple conditions¶

The & and | operators work element-wise¶

Original Question: How many service requests were for 'Pothole' or 'Pavement Maintenance'?¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Selecting rows by position with .take¶

Example 7: Which neighborhood had the most Get It Done requests?¶

Organizing requests by neighborhood¶

GroupBy: Split, aggregate, and combine¶

An illustrative example: Pets 🐱 🐶🐹¶

Visualizing pets.groupby('Species').mean()¶

Pandas Tutor¶

Back to Get It Done service requests 👷¶

Using .groupby in general¶

Observation #1¶

Observation #2¶

Disappearing columns ✨🐇🎩¶

Observation #3¶

Two choices to make when using .groupby¶

Observation #4¶

More practice: IMDb dataset 🎞️¶

Question: How many movies appear from each decade?¶

Question: What was the highest rated movie of the 1990s?¶

Without grouping¶

With grouping¶

Challenge problems¶

Question: How many years have more than 3 movies rated above 8.5?¶

Aside: Using .sum() on a Boolean array/Series¶

Question: Out of the years with more than 3 movies, which had the highest average rating?¶

Question: Which year had the longest movie titles, on average?¶

Question: What is the average rating of movies from years that had at least 3 movies in the Top 250?¶

Summary, next time¶

Summary¶

Next time¶

Example 5: Which neighborhood has the most `'Tree Maintenance'` requests? 🌳¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Example 6: How many service requests were for `'Pothole'` or `'Pavement Maintenance'`?¶

The `&` and `|` operators work element-wise¶

Original Question: How many service requests were for `'Pothole'` or `'Pavement Maintenance'`?¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Selecting rows by position with `.take`¶

Visualizing `pets.groupby('Species').mean()`¶

Using `.groupby` in general¶

Two choices to make when using `.groupby`¶

Aside: Using `.sum()` on a Boolean array/Series¶