# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")
plt.style.use('ggplot')
The requests
DataFrame contains a summary of all service requests so far in 2022, broken down by neighborhood and service.
requests = bpd.read_csv('data/get-it-done-requests.csv')
requests = requests.assign(total=requests.get('closed') + requests.get('open'))
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 46 | 0 | 46 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 1484 | 219 | 1703 |
3 | Balboa Park | Environmental Services Code Compliance | 25 | 1 | 26 |
4 | Balboa Park | Graffiti | 977 | 0 | 977 |
... | ... | ... | ... | ... | ... |
1582 | Via De La Valle | Parking | 1 | 0 | 1 |
1583 | Via De La Valle | Pavement Maintenance | 0 | 1 | 1 |
1584 | Via De La Valle | Pothole | 9 | 1 | 10 |
1585 | Via De La Valle | Stormwater Code Enforcement | 3 | 0 | 3 |
1586 | Via De La Valle | Street Light Maintenance | 1 | 0 | 1 |
1587 rows × 5 columns
To select only certain rows of requests
:
True
s (keep) and False
s (toss), usually by making a comparison.requests[sequence_goes_here]
.There are several types of comparisons we can make.
symbol | meaning |
---|---|
== |
equal to |
!= |
not equal to |
< |
less than |
<= |
less than or equal to |
> |
greater than |
>= |
greater than or equal to |
'Weed Cleanup'
requests?¶Key concept: Selecting rows (via Boolean indexing).
'Weed Cleanup'
requests.'total'
in descending order.'neighborhood'
column.weed_cleanup_only = requests[requests.get('service') == 'Weed Cleanup']
weed_cleanup_only
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
30 | Balboa Park | Weed Cleanup | 23 | 0 | 23 |
61 | Barrio Logan | Weed Cleanup | 10 | 1 | 11 |
87 | Black Mountain Ranch | Weed Cleanup | 0 | 1 | 1 |
116 | Carmel Mountain Ranch | Weed Cleanup | 2 | 0 | 2 |
146 | Carmel Valley | Weed Cleanup | 6 | 1 | 7 |
... | ... | ... | ... | ... | ... |
1433 | Tijuana River Valley | Weed Cleanup | 2 | 0 | 2 |
1489 | Torrey Hills | Weed Cleanup | 1 | 0 | 1 |
1518 | Torrey Pines | Weed Cleanup | 10 | 7 | 17 |
1549 | University | Weed Cleanup | 53 | 10 | 63 |
1580 | Uptown | Weed Cleanup | 36 | 8 | 44 |
53 rows × 5 columns
weed_cleanup_sorted = weed_cleanup_only.sort_values(by='total', ascending=False)
weed_cleanup_sorted
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
1383 | Southeastern San Diego | Weed Cleanup | 72 | 7 | 79 |
807 | Navajo | Weed Cleanup | 66 | 1 | 67 |
177 | Clairemont Mesa | Weed Cleanup | 55 | 11 | 66 |
1549 | University | Weed Cleanup | 53 | 10 | 63 |
1352 | Skyline-Paradise Hills | Weed Cleanup | 52 | 8 | 60 |
... | ... | ... | ... | ... | ... |
268 | East Elliott | Weed Cleanup | 1 | 0 | 1 |
309 | Fairbanks Ranch Country Club | Weed Cleanup | 1 | 0 | 1 |
1489 | Torrey Hills | Weed Cleanup | 1 | 0 | 1 |
87 | Black Mountain Ranch | Weed Cleanup | 0 | 1 | 1 |
746 | Mission Beach | Weed Cleanup | 1 | 0 | 1 |
53 rows × 5 columns
weed_cleanup_sorted.get('neighborhood').iloc[0]
'Southeastern San Diego'
requests[requests.get('service') == 'Lime Cleanup']
neighborhood | service | closed | open | total |
---|
Which expression below evaluates to the total number of service requests in the 'Downtown'
neighborhood?
A. requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()
B. requests.get('total').sum()[requests.get('neighborhood') == 'Downtown']
C. requests['Downtown'].get('total').sum()
D. More than one of the above.
# A
requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()
26211
Question: What is the most commonly requested service in the 'University'
neighborhood (near UCSD)?
Write one line of code that evaluates to the answer.
(requests[requests.get('neighborhood') == 'University']
.sort_values(by='total', ascending=False)
.get('service').iloc[0]
)
'Parking'
'Pothole'
or 'Pavement Maintenance'
?¶Key concept: Queries with multiple conditions.
requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')]
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
12 | Balboa Park | Pavement Maintenance | 15 | 10 | 25 |
13 | Balboa Park | Pothole | 109 | 1 | 110 |
43 | Barrio Logan | Pavement Maintenance | 10 | 12 | 22 |
44 | Barrio Logan | Pothole | 128 | 0 | 128 |
72 | Black Mountain Ranch | Pavement Maintenance | 2 | 2 | 4 |
... | ... | ... | ... | ... | ... |
1532 | University | Pothole | 363 | 2 | 365 |
1562 | Uptown | Pavement Maintenance | 46 | 110 | 156 |
1563 | Uptown | Pothole | 597 | 9 | 606 |
1583 | Via De La Valle | Pavement Maintenance | 0 | 1 | 1 |
1584 | Via De La Valle | Pothole | 9 | 1 | 10 |
109 rows × 5 columns
# You can add line breaks within brackets or parentheses
requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Pavement Maintenance')]
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
12 | Balboa Park | Pavement Maintenance | 15 | 10 | 25 |
13 | Balboa Park | Pothole | 109 | 1 | 110 |
43 | Barrio Logan | Pavement Maintenance | 10 | 12 | 22 |
44 | Barrio Logan | Pothole | 128 | 0 | 128 |
72 | Black Mountain Ranch | Pavement Maintenance | 2 | 2 | 4 |
... | ... | ... | ... | ... | ... |
1532 | University | Pothole | 363 | 2 | 365 |
1562 | Uptown | Pavement Maintenance | 46 | 110 | 156 |
1563 | Uptown | Pothole | 597 | 9 | 606 |
1583 | Via De La Valle | Pavement Maintenance | 0 | 1 | 1 |
1584 | Via De La Valle | Pothole | 9 | 1 | 10 |
109 rows × 5 columns
&
and |
operators work element-wise¶(requests.get('service') == 'Pothole')
0 False 1 False 2 False 3 False 4 False ... 1582 False 1583 False 1584 True 1585 False 1586 False Name: service, Length: 1587, dtype: bool
(requests.get('service') == 'Pavement Maintenance')
0 False 1 False 2 False 3 False 4 False ... 1582 False 1583 True 1584 False 1585 False 1586 False Name: service, Length: 1587, dtype: bool
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')
0 False 1 False 2 False 3 False 4 False ... 1582 False 1583 True 1584 True 1585 False 1586 False Name: service, Length: 1587, dtype: bool
'Pothole'
or 'Pavement Maintenance'
?¶requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Pavement Maintenance')].get('total').sum()
13980
Each of the following questions can be answered by querying the requests
DataFrame.
'Street Flooded'
requests?'Kearny Mesa'
neighborhood, how many different types of services have open requests?'La Jolla'
neighborhood?How many of the questions above require the query to have multiple conditions?
A. 0 B. 1 C. 2 D. 3
Bonus: Try to write the code to answer each question.
# The answer is B, as only the second question requires multiple conditions
# 1. Which neighborhood had the most 'Street Flooded' requests?
(requests[requests.get('service') == 'Street Flooded']
.sort_values(by='total', ascending=False)
.get('neighborhood').iloc[0]
)
'North Park'
#2. In the 'Kearny Mesa' neighborhood, how many different types of services have open requests?
requests[(requests.get('neighborhood') == 'Kearny Mesa') &
(requests.get('open') > 0)].shape[0]
20
#3. How many requests have been closed in the 'La Jolla' neighborhood?
requests[requests.get('neighborhood') == 'La Jolla'].get('closed').sum()
5356
Key concept: Grouping by one column.
requests[requests.get('neighborhood') == 'Carmel Valley'].get('total').sum()
.requests[requests.get('neighborhood') == 'Carmel Valley'].get('total').sum()
1992
requests[requests.get('neighborhood') == 'Torrey Hills'].get('total').sum()
305
It seems like there has to be a better way. And there is!
Observe what happens when we use the .groupby
method on requests
with the argument 'neighborhood'
.
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 5003 | 773 | 5776 |
Barrio Logan | 2158 | 518 | 2676 |
Black Mountain Ranch | 331 | 63 | 394 |
Carmel Mountain Ranch | 732 | 157 | 889 |
Carmel Valley | 1641 | 351 | 1992 |
... | ... | ... | ... |
Torrey Hills | 220 | 85 | 305 |
Torrey Pines | 775 | 200 | 975 |
University | 3435 | 479 | 3914 |
Uptown | 11883 | 2561 | 14444 |
Via De La Valle | 15 | 4 | 19 |
57 rows × 3 columns
Note that the 'total'
counts for Carmel Valley and Torrey Hills are the same as we saw on the previous slide. What just happened? 🤯
Consider the DataFrame pets
containing pet species, colors, and weights.
pets = bpd.DataFrame().assign(
Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
Weight=[40, 15, 20, 80, 25, 1],
Age=[5, 8, 9, 2, 0.5, 3]
)
pets
Species | Color | Weight | Age | |
---|---|---|---|---|
0 | dog | black | 40 | 5.0 |
1 | cat | golden | 15 | 8.0 |
2 | cat | black | 20 | 9.0 |
3 | dog | white | 80 | 2.0 |
4 | dog | golden | 25 | 0.5 |
5 | hamster | golden | 1 | 3.0 |
pets.groupby('Species').mean()
¶pets
into "groups" according to their values in the 'Species'
column.'Species'
by taking the mean
of all numerical columns.'Species'
and sorted by 'Species'
in ascending order.Note that the result contains just one row for cats, one row for dogs, and one row for hamsters!
%%pt
pets.groupby('Species').mean()
pandas
(and babypandas
) code.%%pt
to the top of a code cell to explain the last line of babypandas
code.# Without Pandas Tutor
pets.groupby('Species').mean()
Weight | Age | |
---|---|---|
Species | ||
cat | 17.500000 | 8.5 |
dog | 48.333333 | 2.5 |
hamster | 1.000000 | 3.0 |
%%pt
# With Pandas Tutor
pets.groupby('Species').mean()
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 46 | 0 | 46 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 1484 | 219 | 1703 |
3 | Balboa Park | Environmental Services Code Compliance | 25 | 1 | 26 |
4 | Balboa Park | Graffiti | 977 | 0 | 977 |
... | ... | ... | ... | ... | ... |
1582 | Via De La Valle | Parking | 1 | 0 | 1 |
1583 | Via De La Valle | Pavement Maintenance | 0 | 1 | 1 |
1584 | Via De La Valle | Pothole | 9 | 1 | 10 |
1585 | Via De La Valle | Stormwater Code Enforcement | 3 | 0 | 3 |
1586 | Via De La Valle | Street Light Maintenance | 1 | 0 | 1 |
1587 rows × 5 columns
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 5003 | 773 | 5776 |
Barrio Logan | 2158 | 518 | 2676 |
Black Mountain Ranch | 331 | 63 | 394 |
Carmel Mountain Ranch | 732 | 157 | 889 |
Carmel Valley | 1641 | 351 | 1992 |
... | ... | ... | ... |
Torrey Hills | 220 | 85 | 305 |
Torrey Pines | 775 | 200 | 975 |
University | 3435 | 479 | 3914 |
Uptown | 11883 | 2561 | 14444 |
Via De La Valle | 15 | 4 | 19 |
57 rows × 3 columns
.groupby
in general¶In short, .groupby
aggregates all rows with the same value in a specified column (e.g. 'neighborhood'
) into a single row in the resulting DataFrame, using an aggregation method (e.g. .sum()
) to combine values.
.groupby(column_name)
will gather rows which have the same value in the specified column (column_name
).'neighborhood'
.'neighborhood'
..sum()
method to every 'neighborhood'
.'closed'
, 'open'
, and 'total'
). .count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 46 | 0 | 46 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 1484 | 219 | 1703 |
3 | Balboa Park | Environmental Services Code Compliance | 25 | 1 | 26 |
4 | Balboa Park | Graffiti | 977 | 0 | 977 |
... | ... | ... | ... | ... | ... |
1582 | Via De La Valle | Parking | 1 | 0 | 1 |
1583 | Via De La Valle | Pavement Maintenance | 0 | 1 | 1 |
1584 | Via De La Valle | Pothole | 9 | 1 | 10 |
1585 | Via De La Valle | Stormwater Code Enforcement | 3 | 0 | 3 |
1586 | Via De La Valle | Street Light Maintenance | 1 | 0 | 1 |
1587 rows × 5 columns
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 5003 | 773 | 5776 |
Barrio Logan | 2158 | 518 | 2676 |
Black Mountain Ranch | 331 | 63 | 394 |
Carmel Mountain Ranch | 732 | 157 | 889 |
Carmel Valley | 1641 | 351 | 1992 |
... | ... | ... | ... |
Torrey Hills | 220 | 85 | 305 |
Torrey Pines | 775 | 200 | 975 |
University | 3435 | 479 | 3914 |
Uptown | 11883 | 2561 | 14444 |
Via De La Valle | 15 | 4 | 19 |
57 rows × 3 columns
The 'service'
column has disappeared. Why?
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 46 | 0 | 46 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 1484 | 219 | 1703 |
3 | Balboa Park | Environmental Services Code Compliance | 25 | 1 | 26 |
4 | Balboa Park | Graffiti | 977 | 0 | 977 |
... | ... | ... | ... | ... | ... |
1582 | Via De La Valle | Parking | 1 | 0 | 1 |
1583 | Via De La Valle | Pavement Maintenance | 0 | 1 | 1 |
1584 | Via De La Valle | Pothole | 9 | 1 | 10 |
1585 | Via De La Valle | Stormwater Code Enforcement | 3 | 0 | 3 |
1586 | Via De La Valle | Street Light Maintenance | 1 | 0 | 1 |
1587 rows × 5 columns
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 5003 | 773 | 5776 |
Barrio Logan | 2158 | 518 | 2676 |
Black Mountain Ranch | 331 | 63 | 394 |
Carmel Mountain Ranch | 732 | 157 | 889 |
Carmel Valley | 1641 | 351 | 1992 |
... | ... | ... | ... |
Torrey Hills | 220 | 85 | 305 |
Torrey Pines | 775 | 200 | 975 |
University | 3435 | 479 | 3914 |
Uptown | 11883 | 2561 | 14444 |
Via De La Valle | 15 | 4 | 19 |
57 rows × 3 columns
.sum()
, in this case – is applied to each column.'service'
column.# Can you guess how the max position is determined?
requests.groupby('neighborhood').max()
service | closed | open | total | |
---|---|---|---|---|
neighborhood | ||||
Balboa Park | Weed Cleanup | 1484 | 219 | 1703 |
Barrio Logan | Weed Cleanup | 435 | 114 | 475 |
Black Mountain Ranch | Weed Cleanup | 120 | 20 | 121 |
Carmel Mountain Ranch | Weed Cleanup | 231 | 45 | 231 |
Carmel Valley | Weed Cleanup | 460 | 111 | 467 |
... | ... | ... | ... | ... |
Torrey Hills | Weed Cleanup | 43 | 20 | 61 |
Torrey Pines | Weed Cleanup | 180 | 49 | 180 |
University | Weed Cleanup | 546 | 72 | 605 |
Uptown | Weed Cleanup | 2252 | 456 | 2708 |
Via De La Valle | Street Light Maintenance | 9 | 2 | 10 |
57 rows × 4 columns
requests.groupby('neighborhood').max()
service | closed | open | total | |
---|---|---|---|---|
neighborhood | ||||
Balboa Park | Weed Cleanup | 1484 | 219 | 1703 |
Barrio Logan | Weed Cleanup | 435 | 114 | 475 |
Black Mountain Ranch | Weed Cleanup | 120 | 20 | 121 |
Carmel Mountain Ranch | Weed Cleanup | 231 | 45 | 231 |
Carmel Valley | Weed Cleanup | 460 | 111 | 467 |
... | ... | ... | ... | ... |
Torrey Hills | Weed Cleanup | 43 | 20 | 61 |
Torrey Pines | Weed Cleanup | 180 | 49 | 180 |
University | Weed Cleanup | 546 | 72 | 605 |
Uptown | Weed Cleanup | 2252 | 456 | 2708 |
Via De La Valle | Street Light Maintenance | 9 | 2 | 10 |
57 rows × 4 columns
'Weed Cleanup'
requests in Balboa Park so far this year?'total'
column equal to the sum of the 'closed'
and 'open'
columns, as it originally was?Write a line of code that evaluates to the service that has been requested in the fewest number of neighborhoods.
Strategy:
.count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.# Use .index instead of .get to extract the index, since the index is not considered a column
requests.groupby('service').count().sort_values(by='total').index[0]
'Homeless Outreach'
Note: On an assignment, in questions like this with multiple correct answers (a tie for the fewest), you can put any one of them. We'll accept any correct answer.
.groupby
don't make sense when using the .count()
aggregation method..assign
to create a new column containing the same values as the old column(s)..drop(columns=list_of_column_labels)
to drop the old column(s).num_neighborhoods = requests.groupby('service').count()
num_neighborhoods
neighborhood | closed | open | total | |
---|---|---|---|---|
service | ||||
Dead Animal | 54 | 54 | 54 | 54 |
Development Services - Code Enforcement | 47 | 47 | 47 | 47 |
Encampment | 54 | 54 | 54 | 54 |
Environmental Services Code Compliance | 47 | 47 | 47 | 47 |
Graffiti | 53 | 53 | 53 | 53 |
... | ... | ... | ... | ... |
Traffic Signal Timing | 49 | 49 | 49 | 49 |
Trash/Recycling Collection | 48 | 48 | 48 | 48 |
Tree Maintenance | 55 | 55 | 55 | 55 |
Waste on Private Property | 45 | 45 | 45 | 45 |
Weed Cleanup | 53 | 53 | 53 | 53 |
33 rows × 4 columns
num_neighborhoods = num_neighborhoods.assign(
neighborhoods_requesting=num_neighborhoods.get('total')
).drop(columns=['neighborhood', 'closed', 'open', 'total'])
num_neighborhoods
neighborhoods_requesting | |
---|---|
service | |
Dead Animal | 54 |
Development Services - Code Enforcement | 47 |
Encampment | 54 |
Environmental Services Code Compliance | 47 |
Graffiti | 53 |
... | ... |
Traffic Signal Timing | 49 |
Trash/Recycling Collection | 48 |
Tree Maintenance | 55 |
Waste on Private Property | 45 |
Weed Cleanup | 53 |
33 rows × 1 columns
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb
Votes | Rating | Year | Decade | |
---|---|---|---|---|
Title | ||||
Akira | 91652 | 8.0 | 1988 | 1980 |
Per un pugno di dollari | 124671 | 8.0 | 1964 | 1960 |
Guardians of the Galaxy | 527349 | 8.0 | 2014 | 2010 |
The Man Who Shot Liberty Valance | 49135 | 8.0 | 1962 | 1960 |
Underground | 39447 | 8.0 | 1995 | 1990 |
... | ... | ... | ... | ... |
Schindler's List | 761224 | 8.9 | 1993 | 1990 |
12 Angry Men | 384187 | 8.9 | 1957 | 1950 |
The Godfather: Part II | 692753 | 9.0 | 1974 | 1970 |
The Shawshank Redemption | 1498733 | 9.2 | 1994 | 1990 |
The Godfather | 1027398 | 9.2 | 1972 | 1970 |
250 rows × 4 columns
imdb.groupby('Decade').count()
Votes | Rating | Year | |
---|---|---|---|
Decade | |||
1920 | 4 | 4 | 4 |
1930 | 7 | 7 | 7 |
1940 | 14 | 14 | 14 |
1950 | 30 | 30 | 30 |
1960 | 22 | 22 | 22 |
1970 | 21 | 21 | 21 |
1980 | 31 | 31 | 31 |
1990 | 42 | 42 | 42 |
2000 | 50 | 50 | 50 |
2010 | 29 | 29 | 29 |
# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');
Let's try to do this two different ways.
%%pt
imdb[imdb.get('Decade') == 1990].sort_values('Rating', ascending=False).index[0]
Note: The command to extract the index of a DataFrame is .index
- no parentheses! This is different than the way we extract columns, with .get()
, because the index is not a column.
%%pt
imdb.reset_index().groupby('Decade').max()
.max()
), aggregation is done to each column individually. max
is not helpful here.We won't cover these problems in class, but they're here for you to practice with some harder examples. To access the solutions, you'll need to watch this solution walkthrough video (start at 10:00).
Before watching the video, make sure to try these problems on your own – they're great prep for homeworks, projects, and exams!
good_movies_per_year = imdb[imdb.get('Rating') > 8.5].groupby('Year').count()
good_movies_per_year[good_movies_per_year.get('Votes') > 3].shape[0]
1
.sum()
on a Boolean array/Series¶True
elements. This is because Python treats True
as 1 and False
as 0. (good_movies_per_year.get('Votes') > 3).sum()
1
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3
imdb.groupby('Year').mean()[more_than_3_ix].sort_values(by='Rating').index[-1]
1994
Hint: Use .str.len()
on the column or index that contains the names of the movies.
(
imdb.assign(title_length=imdb.index.str.len())
.groupby('Year').mean()
.sort_values(by='title_length')
.index[-1]
)
1964
imdb[imdb.get('Year') == 1964]
Votes | Rating | Year | Decade | |
---|---|---|---|---|
Title | ||||
Per un pugno di dollari | 124671 | 8.0 | 1964 | 1960 |
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb | 309141 | 8.5 | 1964 | 1960 |
# A Series of Trues and Falses; True when there were at least 3 movies on the list from that year
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3
# The sum of the ratings of movies from years that had at least 3 movies on the list
total_rating = imdb.groupby('Year').sum()[more_than_3_ix].get('Rating').sum()
# The total number of movies from years that had at least 3 movies on the list
count = imdb.groupby('Year').count()[more_than_3_ix].get('Rating').sum()
# The correct answer
average_rating = total_rating / count
average_rating
8.262576687116566
# Close, but incorrect – doesn't account for the fact that different years have different numbers of movies on the list
close_but_wrong = imdb.groupby('Year').mean()[more_than_3_ix].get('Rating').mean()
close_but_wrong
8.264401041666668
&
if you require all to be true, or |
if you require at least one to be true.df.groupby(column_name).agg_method()
aggregates all rows with the same value for column_name
into a single row in the resulting DataFrame, using agg_method()
to combine values..count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.A picture is worth a 1000 words – it's time to visualize!