In [1]:
# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd

import matplotlib.pyplot as plt
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
import pandas as pd
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

Lecture 5 – More Querying and GroupBy¶

DSC 10, Spring 2023¶

Announcements¶

  • Lab 1 is due on Saturday at 11:59PM.
  • Homework 1 is due on Tuesday at 11:59PM.
    • Do Lab 1 before Homework 1.
    • Avoid submission errors.
  • Discussion 2 is today, and we'll be covering old exam problems on this week's material.
    • You must be present when attendance is taken in discussion to get credit, even if you have a conflicting class.

Agenda¶

  • Recap: Queries.
  • Queries with multiple conditions.
  • GroupBy.
  • Extra practice, including challenge problems.

Don't forget about these resources!¶

  • DSC 10 Reference Sheet 📌.
  • babypandas notes.
  • babypandas documentation.
  • The Resources tab of the course website.

You belong here! 🫂¶

  • We're moving very quickly in this class.
  • This may be the first time you're ever writing code, and you may question whether or not you belong in this class, or if data science is for you.
  • We promise, no matter what your prior experience is, the answer is yes, you belong!
    • Watch: 🎥 Developing a Growth Mindset with Carol Dweck.
  • Please come to office hours (see the schedule here) and post on Ed for help – we're here to make sure you succeed in this course.

About the Data: Get It Done service requests 👷¶

Recall, the requests DataFrame contains a summary of all service requests so far this year, broken down by neighborhood and service.

In [2]:
requests = bpd.read_csv('data/get-it-done-requests.csv')
requests = requests.assign(total=requests.get('closed') + requests.get('open'))
requests
Out[2]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
... ... ... ... ... ...
1418 Via De La Valle Pothole 11 7 18
1419 Via De La Valle Sidewalk Repair Issue 0 1 1
1420 Via De La Valle Street Sweeping 1 0 1

1421 rows × 5 columns

Recap: Queries¶

What is a query? 🤔¶

  • A "query" is code that extracts rows from a DataFrame for which certain condition(s) are true.
  • We often use queries to filter DataFrames so that they only contain the rows that satisfy the conditions stated in our questions.

Comparison operators¶

There are several types of comparisons we can make.

symbol meaning
== equal to
!= not equal to
< less than
<= less than or equal to
> greater than
>= greater than or equal to
In [3]:
5 == 6
Out[3]:
False
In [4]:
type(5 == 6)
Out[4]:
bool
In [5]:
9 + 10 < 21
Out[5]:
True
In [6]:
'zebra' == 'zeb' + 'ra'
Out[6]:
True

How do we query a DataFrame?¶

To select only certain rows of requests:

  1. Make a sequence (list/array/Series) of Trues (keep) and Falses (toss), usually by making a comparison.
  2. Then pass it into requests[sequence_goes_here].
In [7]:
requests
Out[7]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
... ... ... ... ... ...
1418 Via De La Valle Pothole 11 7 18
1419 Via De La Valle Sidewalk Repair Issue 0 1 1
1420 Via De La Valle Street Sweeping 1 0 1

1421 rows × 5 columns

In [8]:
# A Boolean Series.
requests.get('closed') > 5
Out[8]:
0        True
1       False
2        True
        ...  
1418     True
1419    False
1420    False
Name: closed, Length: 1421, dtype: bool
In [9]:
# A query.
requests[requests.get('closed') > 5]
Out[9]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
2 Balboa Park Encampment 215 20 235
3 Balboa Park Environmental Services Code Compliance 8 1 9
... ... ... ... ... ...
1413 Uptown Waste on Private Property 40 2 42
1414 Uptown Weed Cleanup 21 3 24
1418 Via De La Valle Pothole 11 7 18

920 rows × 5 columns

Example 5: Which neighborhood has the most 'Pothole' requests? 🕳¶

Key concept: Querying.

Strategy¶

  1. Query to extract a DataFrame of just the 'Pothole' requests.
  2. Sort by 'total' in descending order.
  3. Extract the first element from the 'neighborhood' column.
In [10]:
# This DataFrame only contains rows where the 'service' is 'Pothole'!
only_potholes = requests[requests.get('service') == 'Pothole']
only_potholes
Out[10]:
neighborhood service closed open total
12 Balboa Park Pothole 70 8 78
40 Barrio Logan Pothole 111 9 120
67 Black Mountain Ranch Pothole 19 1 20
... ... ... ... ... ...
1366 University Pothole 712 249 961
1396 Uptown Pothole 513 87 600
1418 Via De La Valle Pothole 11 7 18

56 rows × 5 columns

In [11]:
# You can space your code out like this if needed.
(
    only_potholes
    .sort_values('total', ascending=False)
    .get('neighborhood')
    .iloc[0]
)
Out[11]:
'Clairemont Mesa'

What if the condition isn't satisfied?¶

In [12]:
requests[requests.get('service') == 'Car Maintenance']
Out[12]:
neighborhood service closed open total

Concept Check ✅ – Answer at cc.dsc10.com¶

Which expression below evaluates to the total number of service requests in the 'Downtown' neighborhood?

A. requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()

B. requests.get('total').sum()[requests.get('neighborhood') == 'Downtown']

C. requests['Downtown'].get('total').sum()

D. More than one of the above.

In [13]:
...
Out[13]:
Ellipsis

Activity 🚘¶

Question: What is the most commonly requested service in the 'University' neighborhood (near UCSD)?

Write one line of code that evaluates to the answer.

In [14]:
...
Out[14]:
Ellipsis

Example 6: How many service requests were for 'Pothole' or 'Dead Animal'?¶

Key concept: Queries with multiple conditions.

Multiple conditions¶

  • To write a query with multiple conditions, use & for "and" and | for "or".
    • &: All conditions must be true.
    • |: At least one condition must be true.
  • You must use (parentheses) around each condition!
  • 🚨 Don't use the Python keywords and and or here! They do not behave as you'd want – see the sketch after this list.
    • See BPD 10.3 for an explanation.
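For example, here's a minimal sketch (not from the original notebook) of the difference. With a pandas-backed Series, the keyword version typically raises an error about an ambiguous truth value instead of comparing element-wise.

# Element-wise "or": produces a Boolean Series with one True/False per row.
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')

# Keyword "or": Python tries to reduce each Series to a single True/False,
# which is ambiguous, so this raises an error rather than comparing row by row.
# (requests.get('service') == 'Pothole') or (requests.get('service') == 'Dead Animal')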
In [15]:
requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')]
Out[15]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
12 Balboa Park Pothole 70 8 78
28 Barrio Logan Dead Animal 2 0 2
... ... ... ... ... ...
1396 Uptown Pothole 513 87 600
1415 Via De La Valle Dead Animal 1 0 1
1418 Via De La Valle Pothole 11 7 18

109 rows × 5 columns

In [16]:
# You can add line breaks within brackets or parentheses.
requests[(requests.get('service') == 'Pothole') | 
         (requests.get('service') == 'Dead Animal')]
Out[16]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
12 Balboa Park Pothole 70 8 78
28 Barrio Logan Dead Animal 2 0 2
... ... ... ... ... ...
1396 Uptown Pothole 513 87 600
1415 Via De La Valle Dead Animal 1 0 1
1418 Via De La Valle Pothole 11 7 18

109 rows × 5 columns

The & and | operators work element-wise!¶

In [17]:
(requests.get('service') == 'Pothole')
Out[17]:
0       False
1       False
2       False
        ...  
1418     True
1419    False
1420    False
Name: service, Length: 1421, dtype: bool
In [18]:
(requests.get('service') == 'Dead Animal')
Out[18]:
0        True
1       False
2       False
        ...  
1418    False
1419    False
1420    False
Name: service, Length: 1421, dtype: bool
In [19]:
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')
Out[19]:
0        True
1       False
2       False
        ...  
1418     True
1419    False
1420    False
Name: service, Length: 1421, dtype: bool

Original Question: How many service requests were for 'Pothole' or 'Dead Animal'?¶

In [20]:
requests[(requests.get('service') == 'Pothole') | 
         (requests.get('service') == 'Dead Animal')].get('total').sum()
Out[20]:
18512

Concept Check ✅ – Answer at cc.dsc10.com¶

Each of the following questions can be answered by querying the requests DataFrame.

  1. Which neighborhood had the most 'Street Flooded' requests?
  2. In the 'Kearny Mesa' neighborhood, how many different types of services have open requests?
  3. How many requests have been closed in the 'La Jolla' neighborhood?

How many of the questions above require the query to have multiple conditions?

A. 0              B. 1              C. 2              D. 3

Bonus: Try to write code to answer each question.

In [21]:
...
Out[21]:
Ellipsis

Aside: Using .take to select rows by position¶

  • Querying allows us to select rows that satisfy a certain condition.
  • We can also select rows in specific positions with .take([list_of_integer_positions]). This keeps only the rows whose positions are in the specified list.
    • This is analogous to using .iloc[] on a Series.
    • It's rare to need to select rows by integer position. Querying is far more useful.
In [22]:
requests
Out[22]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
... ... ... ... ... ...
1418 Via De La Valle Pothole 11 7 18
1419 Via De La Valle Sidewalk Repair Issue 0 1 1
1420 Via De La Valle Street Sweeping 1 0 1

1421 rows × 5 columns

In [23]:
requests.take([1, 3, 5])
Out[23]:
neighborhood service closed open total
1 Balboa Park Development Services - Code Enforcement 2 0 2
3 Balboa Park Environmental Services Code Compliance 8 1 9
5 Balboa Park Graffiti - Public 343 25 368
In [24]:
requests.get('service').iloc[[1, 3, 5]]
Out[24]:
1    Development Services - Code Enforcement
3     Environmental Services Code Compliance
5                          Graffiti - Public
Name: service, dtype: object
In [25]:
requests.take(np.arange(5))
Out[25]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
3 Balboa Park Environmental Services Code Compliance 8 1 9
4 Balboa Park Graffiti - Code Enforcement 2 1 3

Example 7: Which neighborhood had the most requests?¶

Key concept: Grouping by one column.

Organizing requests by neighborhood¶

We can find the total number of Get It Done requests in any one neighborhood using the tools we already have.

In [26]:
requests[requests.get('neighborhood') == 'Black Mountain Ranch'].get('total').sum()
Out[26]:
195
In [27]:
requests[requests.get('neighborhood') == 'Uptown'].get('total').sum()
Out[27]:
7345

If we wanted to find the total number of requests in every neighborhood, this would be quite inconvenient... there has to be a better way!

A new method: .groupby¶

Observe what happens when we use the .groupby method on requests with the argument 'neighborhood'.

In [28]:
requests.groupby('neighborhood').sum()
Out[28]:
closed open total
neighborhood
Balboa Park 1173 261 1434
Barrio Logan 1185 201 1386
Black Mountain Ranch 151 44 195
... ... ... ...
University 1614 620 2234
Uptown 5654 1691 7345
Via De La Valle 13 12 25

57 rows × 3 columns

Note that the 'total' counts for Black Mountain Ranch and Uptown are the same as we saw on the previous slide. What just happened? 🤯

An illustrative example: Pets 🐱 🐶🐹¶

Consider the DataFrame pets, shown below.

Species Color Weight Age
0 dog black 40 5.0
1 cat golden 15 8.0
2 cat black 20 9.0
3 dog white 80 2.0
4 dog golden 25 0.5
5 hamster golden 1 3.0

When we run pets.groupby('Species').mean(), babypandas does three things under the hood.

Step 1: Split¶

First, it splits the rows of pets into "groups" according to their values in the 'Species' column.

🐶
Species Color Weight Age
0 dog black 40 5.0
3 dog white 80 2.0
4 dog golden 25 0.5


🐱

Species Color Weight Age
1 cat golden 15 8.0
2 cat black 20 9.0


🐹

Species Color Weight Age
5 hamster golden 1 3.0

Step 2: Aggregate¶

Then, it aggregates the rows with the same value of 'Species' by taking the mean of all numerical columns.

Weight Age
dog 48.33 2.5


Weight Age
cat 17.5 8.5


Weight Age
hamster 1.0 3.0


Step 3: Combine¶

Finally, it combines these means into a new DataFrame that is indexed by 'Species' and sorted by 'Species' in ascending order.

Weight Age
Species
cat 17.50 8.5
dog 48.33 2.5
hamster 1.00 3.0

Let's try it out!¶

In [29]:
pets = bpd.DataFrame().assign(
    Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
    Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
    Weight=[40, 15, 20, 80, 25, 1],
    Age=[5, 8, 9, 2, 0.5, 3]
)
pets
Out[29]:
Species Color Weight Age
0 dog black 40 5.0
1 cat golden 15 8.0
2 cat black 20 9.0
3 dog white 80 2.0
4 dog golden 25 0.5
5 hamster golden 1 3.0
In [30]:
pets.groupby('Species').mean()
Out[30]:
Weight Age
Species
cat 17.50 8.5
dog 48.33 2.5
hamster 1.00 3.0
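As a quick variation (not in the original lecture), the same split-aggregate-combine steps work with any grouping column and aggregation method. For instance, grouping the same pets DataFrame by 'Color':

# One row per unique 'Color' (black, golden, white), containing the mean
# 'Weight' and 'Age' of the pets of that color. The 'Species' column is
# dropped, since strings can't be averaged.
pets.groupby('Color').mean()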

Back to Get It Done service requests 👷¶

In [31]:
requests
Out[31]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
... ... ... ... ... ...
1418 Via De La Valle Pothole 11 7 18
1419 Via De La Valle Sidewalk Repair Issue 0 1 1
1420 Via De La Valle Street Sweeping 1 0 1

1421 rows × 5 columns

In [32]:
requests.groupby('neighborhood').sum()
Out[32]:
closed open total
neighborhood
Balboa Park 1173 261 1434
Barrio Logan 1185 201 1386
Black Mountain Ranch 151 44 195
... ... ... ...
University 1614 620 2234
Uptown 5654 1691 7345
Via De La Valle 13 12 25

57 rows × 3 columns

Our original goal was to find the neighborhood with the most total requests, so after grouping, we need to sort:

In [33]:
# Note the use of .index – remember, the index isn't a column!
(
    requests
    .groupby('neighborhood')
    .sum()
    .sort_values(by='total', ascending=False)
    .index[0]
)
Out[33]:
'Downtown'

Using .groupby in general¶

In short, .groupby aggregates all rows with the same value in a specified column (e.g. 'neighborhood') into a single row in the resulting DataFrame, using an aggregation method (e.g. .sum()) to combine values.

  1. Choose a column to group by.
    • .groupby(column_name) will gather rows which have the same value in the specified column (column_name).
    • On the previous slide, we grouped by 'neighborhood'.
    • In the resulting DataFrame, there was one row for every unique value of 'neighborhood'.
  2. Choose an aggregation method.
    • The aggregation method will be applied within each group.
    • On the previous slide, we applied the .sum() method to every 'neighborhood'.
    • The aggregation method is applied individually to each column (e.g. the sums were computed separately for 'closed', 'open', and 'total').
      • If it doesn't make sense to use the aggregation method on a column, the column is dropped from the output – we'll look at this in more detail shortly.
    • Common aggregation methods include .count(), .sum(), .mean(), .median(), .max(), and .min().
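For instance, here's a sketch of the same two-step pattern with a different aggregation method (this cell isn't in the original lecture):

# Step 1: group by 'neighborhood'. Step 2: aggregate each group with .mean().
# The result has one row per neighborhood, containing the average number of
# 'closed', 'open', and 'total' requests per service type in that neighborhood.
requests.groupby('neighborhood').mean()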

Observation #1¶

  • The index has changed to neighborhood names.
  • In general, the new row labels are the group labels (i.e., the unique values in the column that we grouped on), sorted in ascending order.
In [34]:
requests
Out[34]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
... ... ... ... ... ...
1418 Via De La Valle Pothole 11 7 18
1419 Via De La Valle Sidewalk Repair Issue 0 1 1
1420 Via De La Valle Street Sweeping 1 0 1

1421 rows × 5 columns

In [35]:
requests.groupby('neighborhood').sum()
Out[35]:
closed open total
neighborhood
Balboa Park 1173 261 1434
Barrio Logan 1185 201 1386
Black Mountain Ranch 151 44 195
... ... ... ...
University 1614 620 2234
Uptown 5654 1691 7345
Via De La Valle 13 12 25

57 rows × 3 columns

Observation #2¶

The 'service' column has disappeared. Why?

In [36]:
requests
Out[36]:
neighborhood service closed open total
0 Balboa Park Dead Animal 11 0 11
1 Balboa Park Development Services - Code Enforcement 2 0 2
2 Balboa Park Encampment 215 20 235
... ... ... ... ... ...
1418 Via De La Valle Pothole 11 7 18
1419 Via De La Valle Sidewalk Repair Issue 0 1 1
1420 Via De La Valle Street Sweeping 1 0 1

1421 rows × 5 columns

In [37]:
requests.groupby('neighborhood').sum()
Out[37]:
closed open total
neighborhood
Balboa Park 1173 261 1434
Barrio Logan 1185 201 1386
Black Mountain Ranch 151 44 195
... ... ... ...
University 1614 620 2234
Uptown 5654 1691 7345
Via De La Valle 13 12 25

57 rows × 3 columns

Disappearing columns ✨🐇🎩¶

  • The aggregation method – .sum(), in this case – is applied to each column.
  • If it doesn't make sense to apply it to a particular column, that column will disappear.
  • For instance, we can't sum strings, like in the 'service' column.
  • However, we can compute the max of several strings. How?
In [38]:
# Can you guess how the max is determined?
requests.groupby('neighborhood').max() 
Out[38]:
service closed open total
neighborhood
Balboa Park Weed Cleanup 343 57 368
Barrio Logan Weed Cleanup 286 48 317
Black Mountain Ranch Tree Maintenance 41 14 44
... ... ... ... ...
University Weed Cleanup 712 249 961
Uptown Weed Cleanup 1422 384 1806
Via De La Valle Street Sweeping 11 7 18

57 rows × 4 columns

Observation #3¶

  • The aggregation method is applied to each column separately.
  • The rows of the resulting DataFrame need to be interpreted with care.
In [39]:
requests.groupby('neighborhood').max()
Out[39]:
service closed open total
neighborhood
Balboa Park Weed Cleanup 343 57 368
Barrio Logan Weed Cleanup 286 48 317
Black Mountain Ranch Tree Maintenance 41 14 44
... ... ... ... ...
University Weed Cleanup 712 249 961
Uptown Weed Cleanup 1422 384 1806
Via De La Valle Street Sweeping 11 7 18

57 rows × 4 columns

Why isn't the 'total' column equal to the sum of the 'closed' and 'open' columns, as it originally was?

In [40]:
# Why don't these numbers match those in the grouped DataFrame?
requests[(requests.get('neighborhood') == 'Balboa Park') & (requests.get('service') == 'Weed Cleanup')]
Out[40]:
neighborhood service closed open total
27 Balboa Park Weed Cleanup 1 0 1

Example: Number of different services¶

How do we find the number of different services requested in each neighborhood?

As always when using groupby, there are two steps:

  1. Choose a column to group by.
    • Here, 'neighborhood' seems like a good choice.
  2. Choose an aggregation method.
    • Common aggregation methods include .count(), .sum(), .mean(), .median(), .max(), and .min().
In [41]:
# How many different services have been requested in the 'University' neighborhood?
requests[requests.get('neighborhood') == 'University']
Out[41]:
neighborhood service closed open total
1354 University Dead Animal 25 0 25
1355 University Development Services - Code Enforcement 7 2 9
1356 University Encampment 55 27 82
... ... ... ... ... ...
1381 University Tree Maintenance 47 8 55
1382 University Waste on Private Property 5 1 6
1383 University Weed Cleanup 1 4 5

30 rows × 5 columns

In [42]:
# How do we find this result for every neighborhood?

Observation #4¶

The column names of the output of .groupby don't make sense when using the .count() aggregation method.

In [43]:
num_diff_services = requests.groupby('neighborhood').count()
num_diff_services
Out[43]:
service closed open total
neighborhood
Balboa Park 28 28 28 28
Barrio Logan 28 28 28 28
Black Mountain Ranch 24 24 24 24
... ... ... ... ...
University 30 30 30 30
Uptown 31 31 31 31
Via De La Valle 6 6 6 6

57 rows × 4 columns

Consider dropping unneeded columns and renaming columns as follows:

  1. Use .assign to create a new column containing the same values as the old column(s).
  2. Use .drop(columns=list_of_column_labels) to drop the old column(s). Alternatively, use .get(list_of_column_labels) to keep only the columns in the given list. The columns will appear in the order you specify, so this is also useful for reordering columns! (A sketch of this alternative appears after the next cell.)
In [44]:
num_diff_services = num_diff_services.assign(
                    count_of_services=num_diff_services.get('open')
                    ).drop(columns=['service', 'closed', 'open', 'total'])
num_diff_services
Out[44]:
count_of_services
neighborhood
Balboa Park 28
Barrio Logan 28
Black Mountain Ranch 24
... ...
University 30
Uptown 31
Via De La Valle 6

57 rows × 1 columns
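The .get alternative mentioned in step 2 could look like this – a sketch that should be equivalent in effect to the .assign/.drop approach above:

# Recompute the per-neighborhood counts, label one of the (identical) count
# columns 'count_of_services', and keep only that column. Passing a list to
# .get returns a DataFrame with just the listed columns, in the given order.
counts = requests.groupby('neighborhood').count()
counts.assign(count_of_services=counts.get('open')).get(['count_of_services'])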

More practice: IMDb dataset 🎞️¶

Challenge problems!¶

We won't cover this section in class. Instead, it's here for you to practice with some harder examples.

The video below walks through the solutions (it's also linked here). You can also see the solutions by clicking the "✅ Click here to see the answer." button below each question.

In [45]:
from IPython.display import YouTubeVideo
YouTubeVideo('xg7rnjWnZ48')
Out[45]:

Before watching the video or looking at the solutions, make sure to try these problems on your own – they're great prep for homeworks, projects, and exams! Feel free to ask about them in office hours or on Ed.

In [46]:
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb
Out[46]:
Votes Rating Year Decade
Title
Akira 91652 8.0 1988 1980
Per un pugno di dollari 124671 8.0 1964 1960
Guardians of the Galaxy 527349 8.0 2014 2010
... ... ... ... ...
The Godfather: Part II 692753 9.0 1974 1970
The Shawshank Redemption 1498733 9.2 1994 1990
The Godfather 1027398 9.2 1972 1970

250 rows × 4 columns

Question: How many movies appear from each decade?¶

In [47]:
imdb.groupby('Decade').count()
Out[47]:
Votes Rating Year
Decade
1920 4 4 4
1930 7 7 7
1940 14 14 14
... ... ... ...
1990 42 42 42
2000 50 50 50
2010 29 29 29

10 rows × 3 columns

In [48]:
# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');

Question: What was the highest rated movie of the 1990s?¶

Let's try to do this two different ways.

Without grouping¶

In [49]:
imdb[imdb.get('Decade') == 1990].sort_values('Rating', ascending=False).index[0]
Out[49]:
'The Shawshank Redemption'

Note: The command to extract the index of a DataFrame is .index – no parentheses! This is different from the way we extract columns, with .get(), because the index is not a column.
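A minimal illustration of the difference, using the imdb DataFrame from above:

imdb.index          # the row labels (movie titles) – an attribute, no parentheses
imdb.get('Rating')  # a column – retrieved with the .get() method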

With grouping¶

In [50]:
imdb.reset_index().groupby('Decade').max()
Out[50]:
Title Votes Rating Year
Decade
1920 The Kid 98794 8.3 1927
1930 The Wizard of Oz 259235 8.5 1939
1940 The Treasure of the Sierra Madre 350551 8.6 1949
... ... ... ... ...
1990 Unforgiven 1498733 9.2 1999
2000 Yip Man 1473049 8.9 2009
2010 X-Men: Days of Future Past 1271949 8.7 2015

10 rows × 4 columns

  • It turns out that this method does not yield the correct answer.
  • When we use an aggregation method (e.g. .max()), aggregation is done to each column individually.
  • While it's true that the highest rated movie from the 1990s has a rating of 9.2, that movie is not Unforgiven – instead, Unforgiven is the movie that's the latest in the alphabet among all movies from the 1990s.
  • Taking the max is not helpful here.
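As a quick check (a sketch, not in the original notebook), the 1990s movie with the 9.2 rating is The Shawshank Redemption, not Unforgiven:

# Query for the 1990s movie whose rating matches the decade's maximum of 9.2.
imdb[(imdb.get('Decade') == 1990) & (imdb.get('Rating') == 9.2)]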

Question: How many years have more than 3 movies rated above 8.5?¶

In [ ]:
 
✅ Click here to see the answer.
good_movies_per_year = imdb[imdb.get('Rating') > 8.5].groupby('Year').count()
good_movies_per_year[good_movies_per_year.get('Votes') > 3].shape[0]    
As explained in the aside below, you can also use:
(good_movies_per_year.get('Votes') > 3).sum() 

Aside: Using .sum() on a boolean array¶

  • Summing a boolean array gives a count of the number of True elements because Python treats True as 1 and False as 0.
  • Can you use that fact here?
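A minimal illustration of this fact:

# True counts as 1 and False counts as 0, so the sum is the number of Trues.
np.array([True, False, True, True]).sum()   # evaluates to 3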
In [ ]:
 

Question: Out of the years with more than 3 movies, which had the highest average rating?¶

In [ ]:
 
✅ Click here to see the answer.
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3
imdb.groupby('Year').mean()[more_than_3_ix].sort_values(by='Rating').index[-1]

Question: Which year had the longest movie titles, on average?¶

Hint: Use .str.len() on the column or index that contains the names of the movies.

In [ ]:
 
✅ Click here to see the answer.
(
    imdb.assign(title_length=imdb.index.str.len())
    .groupby('Year').mean()
    .sort_values(by='title_length')
    .index[-1]
)
The year is 1964 – take a look at the movies from 1964 by querying!

Question: What is the average rating of movies from years that had at least 3 movies in the Top 250?¶

In [ ]:
 
✅ Click here to see the answer.
# A Series of Trues and Falses; True when there were at least 3 movies on the list from that year
at_least_3_ix = imdb.groupby('Year').count().get('Votes') >= 3

# The sum of the ratings of movies from years that had at least 3 movies on the list
total_rating = imdb.groupby('Year').sum()[at_least_3_ix].get('Rating').sum()

# The total number of movies from years that had at least 3 movies on the list
count = imdb.groupby('Year').count()[at_least_3_ix].get('Rating').sum()

# The correct answer
average_rating = total_rating / count

# Close, but incorrect:
# Doesn't account for the fact that different years have different numbers of movies on the list
close_but_wrong = imdb.groupby('Year').mean()[at_least_3_ix].get('Rating').mean()

Summary, next time¶

Summary¶

  • We can write queries that involve multiple conditions, as long as we:
    • Put parentheses around all conditions.
    • Separate conditions using & if you require all to be true, or | if you require at least one to be true.
  • The method call df.groupby(column_name).agg_method() aggregates all rows with the same value for column_name into a single row in the resulting DataFrame, using agg_method() to combine values.
    • Common aggregation methods include .count(), .sum(), .mean(), .median(), .max(), and .min().
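As a final sketch combining both ideas (not from the lecture itself) – counting the 'Pothole' or 'Dead Animal' requests in each neighborhood:

# Query with two conditions, then group the matching rows by 'neighborhood'
# and sum the numeric columns within each group.
requests[(requests.get('service') == 'Pothole') |
         (requests.get('service') == 'Dead Animal')].groupby('neighborhood').sum()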

Next time¶

A picture is worth a 1000 words – it's time to visualize!