# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd

%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}

import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")
plt.style.use('ggplot')


# Read the Get It Done service requests from CSV.
requests = bpd.read_csv('data/get-it-done-requests.csv')

# Add a 'total' column: closed requests plus open requests, row by row.
requests = requests.assign(
    total=requests.get('closed') + requests.get('open')
)

# Display the resulting DataFrame.
requests


weed_cleanup_only = requests[requests.get('service') == 'Weed Cleanup']
weed_cleanup_only


weed_cleanup_sorted = weed_cleanup_only.sort_values(by='total', ascending=False)
weed_cleanup_sorted


weed_cleanup_sorted.get('neighborhood').iloc[0]

'Southeastern San Diego'


requests[requests.get('service') == 'Lime Cleanup']


# A
requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()

26211


(requests[requests.get('neighborhood') == 'University']
 .sort_values(by='total', ascending=False)
 .get('service').iloc[0]
)

'Parking'


requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')]


# You can add line breaks within brackets or parentheses
requests[(requests.get('service') == 'Pothole') | 
         (requests.get('service') == 'Pavement Maintenance')]


(requests.get('service') == 'Pothole')

0       False
1       False
2       False
3       False
4       False
        ...  
1582    False
1583    False
1584     True
1585    False
1586    False
Name: service, Length: 1587, dtype: bool


(requests.get('service') == 'Pavement Maintenance')

0       False
1       False
2       False
3       False
4       False
        ...  
1582    False
1583     True
1584    False
1585    False
1586    False
Name: service, Length: 1587, dtype: bool


(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')

0       False
1       False
2       False
3       False
4       False
        ...  
1582    False
1583     True
1584     True
1585    False
1586    False
Name: service, Length: 1587, dtype: bool


# Keep rows whose service is either 'Pothole' or 'Pavement Maintenance'
# (element-wise OR of two Boolean Series), then add up their 'total' values.
(
    requests[
        (requests.get('service') == 'Pothole')
        | (requests.get('service') == 'Pavement Maintenance')
    ]
    .get('total')
    .sum()
)

13980


# The answer is B, as only the second question requires multiple conditions

# 1. Which neighborhood had the most 'Street Flooded' requests?

(requests[requests.get('service') == 'Street Flooded']
 .sort_values(by='total', ascending=False)
 .get('neighborhood').iloc[0]
)

'North Park'


# 2. In the 'Kearny Mesa' neighborhood, how many different types of services have open requests?

requests[(requests.get('neighborhood') == 'Kearny Mesa') & 
         (requests.get('open') > 0)].shape[0]

20


# 3. How many requests have been closed in the 'La Jolla' neighborhood?

requests[requests.get('neighborhood') == 'La Jolla'].get('closed').sum()

5356


requests[requests.get('neighborhood') == 'Carmel Valley'].get('total').sum()

1992


requests[requests.get('neighborhood') == 'Torrey Hills'].get('total').sum()

305


requests.groupby('neighborhood').sum()


pets = bpd.DataFrame().assign(
    Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
    Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
    Weight=[40, 15, 20, 80, 25, 1],
    Age=[5, 8, 9, 2, 0.5, 3]
)
pets


%%pt

pets.groupby('Species').mean()


# Without Pandas Tutor
pets.groupby('Species').mean()


%%pt

# With Pandas Tutor
pets.groupby('Species').mean()


requests


requests.groupby('neighborhood').sum()


requests


requests.groupby('neighborhood').sum()


requests


requests.groupby('neighborhood').sum()


# Can you guess how the max of a text column like 'service' is determined?
requests.groupby('neighborhood').max()


requests.groupby('neighborhood').max()


# Use .index instead of .get to extract the index, since the index is not considered a column

requests.groupby('service').count().sort_values(by='total').index[0]

'Homeless Outreach'


num_neighborhoods = requests.groupby('service').count()
num_neighborhoods


# Copy the 'total' column under the name 'neighborhoods_requesting',
# then drop the original columns so that only the renamed one is left.
num_neighborhoods = (
    num_neighborhoods
    .assign(neighborhoods_requesting=num_neighborhoods.get('total'))
    .drop(columns=['neighborhood', 'closed', 'open', 'total'])
)
num_neighborhoods


imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb


imdb.groupby('Decade').count()


# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');


%%pt
# Query the 1990s movies, sort by rating from highest to lowest,
# and take the first index label (the index of `imdb` is the movie title).
(
    imdb[imdb.get('Decade') == 1990]
    .sort_values(by='Rating', ascending=False)
    .index[0]
)


%%pt
imdb.reset_index().groupby('Decade').max()


# For each year, count the movies rated above 8.5.
# After .count(), every remaining column holds that per-year count.
good_movies_per_year = (
    imdb[imdb.get('Rating') > 8.5]
    .groupby('Year')
    .count()
)

# Keep the years with more than 3 such movies; the number of rows left
# is the number of years that qualify.
good_movies_per_year[good_movies_per_year.get('Votes') > 3].shape[0]

1


(good_movies_per_year.get('Votes') > 3).sum()

1


# Boolean Series indexed by year: True where more than 3 movies
# from that year are on the list.
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3

# Average each column per year, keep only the qualifying years, sort by
# average rating in ascending order, and take the last index label.
(
    imdb.groupby('Year').mean()[more_than_3_ix]
    .sort_values(by='Rating')
    .index[-1]
)

1994


(
    imdb.assign(title_length=imdb.index.str.len())
    .groupby('Year').mean()
    .sort_values(by='title_length')
    .index[-1]
)

1964


imdb[imdb.get('Year') == 1964]


# A Boolean Series indexed by year; True when MORE than 3 movies on the list came from that year
# (strict `>`, matching the name `more_than_3_ix`).
# NOTE(review): the question is phrased as "at least 3 movies", which would be `>= 3` – confirm which is intended.
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3

# The sum of the ratings of movies from years that had more than 3 movies on the list
total_rating = imdb.groupby('Year').sum()[more_than_3_ix].get('Rating').sum()

# The total number of movies from years that had more than 3 movies on the list
count = imdb.groupby('Year').count()[more_than_3_ix].get('Rating').sum()

# The correct answer: total rating divided by total number of movies, so every movie
# counts equally (rather than averaging the per-year averages)
average_rating = total_rating / count
average_rating

8.262576687116566


# Close, but incorrect – doesn't account for the fact that different years have different numbers of movies on the list
close_but_wrong = imdb.groupby('Year').mean()[more_than_3_ix].get('Rating').mean()
close_but_wrong

8.264401041666668

	neighborhood	service	closed	open	total
0	Balboa Park	Dead Animal	46	0	46
1	Balboa Park	Development Services - Code Enforcement	2	0	2
2	Balboa Park	Encampment	1484	219	1703
3	Balboa Park	Environmental Services Code Compliance	25	1	26
4	Balboa Park	Graffiti	977	0	977
...	...	...	...	...	...
1582	Via De La Valle	Parking	1	0	1
1583	Via De La Valle	Pavement Maintenance	0	1	1
1584	Via De La Valle	Pothole	9	1	10
1585	Via De La Valle	Stormwater Code Enforcement	3	0	3
1586	Via De La Valle	Street Light Maintenance	1	0	1

symbol	meaning
`==`	equal to
`!=`	not equal to
`<`	less than
`<=`	less than or equal to
`>`	greater than
`>=`	greater than or equal to

	closed	open	total
neighborhood
Balboa Park	5003	773	5776
Barrio Logan	2158	518	2676
Black Mountain Ranch	331	63	394
Carmel Mountain Ranch	732	157	889
Carmel Valley	1641	351	1992
...	...	...	...
Torrey Hills	220	85	305
Torrey Pines	775	200	975
University	3435	479	3914
Uptown	11883	2561	14444
Via De La Valle	15	4	19

	neighborhood	service	closed	open	total
0	Balboa Park	Dead Animal	46	0	46
1	Balboa Park	Development Services - Code Enforcement	2	0	2
2	Balboa Park	Encampment	1484	219	1703
3	Balboa Park	Environmental Services Code Compliance	25	1	26
4	Balboa Park	Graffiti	977	0	977
...	...	...	...	...	...
1582	Via De La Valle	Parking	1	0	1
1583	Via De La Valle	Pavement Maintenance	0	1	1
1584	Via De La Valle	Pothole	9	1	10
1585	Via De La Valle	Stormwater Code Enforcement	3	0	3
1586	Via De La Valle	Street Light Maintenance	1	0	1

	closed	open	total
neighborhood
Balboa Park	5003	773	5776
Barrio Logan	2158	518	2676
Black Mountain Ranch	331	63	394
Carmel Mountain Ranch	732	157	889
Carmel Valley	1641	351	1992
...	...	...	...
Torrey Hills	220	85	305
Torrey Pines	775	200	975
University	3435	479	3914
Uptown	11883	2561	14444
Via De La Valle	15	4	19

	neighborhood	service	closed	open	total
30	Balboa Park	Weed Cleanup	23	0	23
61	Barrio Logan	Weed Cleanup	10	1	11
87	Black Mountain Ranch	Weed Cleanup	0	1	1
116	Carmel Mountain Ranch	Weed Cleanup	2	0	2
146	Carmel Valley	Weed Cleanup	6	1	7
...	...	...	...	...	...
1433	Tijuana River Valley	Weed Cleanup	2	0	2
1489	Torrey Hills	Weed Cleanup	1	0	1
1518	Torrey Pines	Weed Cleanup	10	7	17
1549	University	Weed Cleanup	53	10	63
1580	Uptown	Weed Cleanup	36	8	44

	neighborhood	service	closed	open	total
1383	Southeastern San Diego	Weed Cleanup	72	7	79
807	Navajo	Weed Cleanup	66	1	67
177	Clairemont Mesa	Weed Cleanup	55	11	66
1549	University	Weed Cleanup	53	10	63
1352	Skyline-Paradise Hills	Weed Cleanup	52	8	60
...	...	...	...	...	...
268	East Elliott	Weed Cleanup	1	0	1
309	Fairbanks Ranch Country Club	Weed Cleanup	1	0	1
1489	Torrey Hills	Weed Cleanup	1	0	1
87	Black Mountain Ranch	Weed Cleanup	0	1	1
746	Mission Beach	Weed Cleanup	1	0	1

	Species	Color	Weight	Age
0	dog	black	40	5.0
1	cat	golden	15	8.0
2	cat	black	20	9.0
3	dog	white	80	2.0
4	dog	golden	25	0.5
5	hamster	golden	1	3.0

	Votes	Rating	Year	Decade
Title
Akira	91652	8.0	1988	1980
Per un pugno di dollari	124671	8.0	1964	1960
Guardians of the Galaxy	527349	8.0	2014	2010
The Man Who Shot Liberty Valance	49135	8.0	1962	1960
Underground	39447	8.0	1995	1990
...	...	...	...	...
Schindler's List	761224	8.9	1993	1990
12 Angry Men	384187	8.9	1957	1950
The Godfather: Part II	692753	9.0	1974	1970
The Shawshank Redemption	1498733	9.2	1994	1990
The Godfather	1027398	9.2	1972	1970

	Votes	Rating	Year
Decade
1920	4	4	4
1930	7	7	7
1940	14	14	14
1950	30	30	30
1960	22	22	22
1970	21	21	21
1980	31	31	31
1990	42	42	42
2000	50	50	50
2010	29	29	29

Lecture 6 – More Queries and GroupBy¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

Resources:¶

About the Data: Get It Done service requests 👷¶

Recap: queries¶

What is a query? 🤔¶

How do we query a DataFrame?¶

Element-wise comparisons¶

Example 6: Which neighborhood has the most 'Weed Cleanup' requests?¶

Strategy¶

What if the condition isn't satisfied?¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Activity 🚘¶

Example 7: How many service requests were for 'Pothole' or 'Pavement Maintenance'?¶

Multiple conditions¶

The & and | operators work element-wise¶

Original Question: How many service requests were for 'Pothole' or 'Pavement Maintenance'?¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Example 8: Which neighborhood had the most Get It Done requests?¶

Organizing requests by neighborhood¶

GroupBy: Split, aggregate, and combine¶

An illustrative example: Pets 🐱 🐶🐹¶

Visualizing pets.groupby('Species').mean()¶

Pandas Tutor¶

Back to Get It Done service requests 👷¶

Using .groupby in general¶

Observation #1¶

Observation #2¶

Disappearing columns ✨🐇🎩¶

Observation #3¶

Activity¶

Observation #4¶

More practice: IMDb dataset 🎞️¶

Question: How many movies appear from each decade?¶

Question: What was the highest rated movie of the 1990s?¶

Without grouping¶

With grouping¶

Challenge problems¶

Question: How many years have more than 3 movies rated above 8.5?¶

Aside: Using .sum() on a Boolean array/Series¶

Question: Out of the years with more than 3 movies, which had the highest average rating?¶

Question: Which year had the longest movie titles, on average?¶

Question: What is the average rating of movies from years that had at least 3 movies in the Top 250?¶

Summary, next time¶

Summary¶

Next time¶

Example 6: Which neighborhood has the most `'Weed Cleanup'` requests?¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Example 7: How many service requests were for `'Pothole'` or `'Pavement Maintenance'`?¶

The `&` and `|` operators work element-wise¶

Original Question: How many service requests were for `'Pothole'` or `'Pavement Maintenance'`?¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Visualizing `pets.groupby('Species').mean()`¶

Using `.groupby` in general¶

Aside: Using `.sum()` on a Boolean array/Series¶