# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
import pandas as pd
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
Recall, the requests
DataFrame contains a summary of all service requests so far this year, broken down by neighborhood and service.
requests = bpd.read_csv('data/get-it-done-requests.csv')
requests = requests.assign(total=requests.get('closed') + requests.get('open'))
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
There are several types of comparisons we can make.
symbol | meaning |
---|---|
== |
equal to |
!= |
not equal to |
< |
less than |
<= |
less than or equal to |
> |
greater than |
>= |
greater than or equal to |
5 == 6
False
type(5 == 6)
bool
9 + 10 < 21
True
'zebra' == 'zeb' + 'ra'
True
To select only certain rows of requests
:
True
s (keep) and False
s (toss), usually by making a comparison.requests[sequence_goes_here]
.requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
# A Boolean Series.
requests.get('closed') > 5
0 True 1 False 2 True ... 1418 True 1419 False 1420 False Name: closed, Length: 1421, dtype: bool
# A query.
requests[requests.get('closed') > 5]
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
3 | Balboa Park | Environmental Services Code Compliance | 8 | 1 | 9 |
... | ... | ... | ... | ... | ... |
1413 | Uptown | Waste on Private Property | 40 | 2 | 42 |
1414 | Uptown | Weed Cleanup | 21 | 3 | 24 |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
920 rows × 5 columns
'Pothole'
requests? 🕳¶Key concept: Querying.
'Pothole'
requests.'total'
in descending order.'neighborhood'
column.# This DataFrame only contains rows where the 'service' is 'Pothole'!
only_potholes = requests[requests.get('service') == 'Pothole']
only_potholes
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
12 | Balboa Park | Pothole | 70 | 8 | 78 |
40 | Barrio Logan | Pothole | 111 | 9 | 120 |
67 | Black Mountain Ranch | Pothole | 19 | 1 | 20 |
... | ... | ... | ... | ... | ... |
1366 | University | Pothole | 712 | 249 | 961 |
1396 | Uptown | Pothole | 513 | 87 | 600 |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
56 rows × 5 columns
# You can space your code out like this if needed.
(
only_potholes
.sort_values('total', ascending=False)
.get('neighborhood')
.iloc[0]
)
'Clairemont Mesa'
requests[requests.get('service') == 'Car Maintenance']
neighborhood | service | closed | open | total |
---|
Which expression below evaluates to the total number of service requests in the 'Downtown'
neighborhood?
A. requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()
B. requests.get('total').sum()[requests.get('neighborhood') == 'Downtown']
C. requests['Downtown'].get('total').sum()
D. More than one of the above.
...
Ellipsis
Question: What is the most commonly requested service in the 'University'
neighborhood (near UCSD)?
Write one line of code that evaluates to the answer.
...
Ellipsis
'Pothole'
or 'Dead Animal'
?¶Key concept: Queries with multiple conditions.
&
for "and" and |
for "or".&
: All conditions must be true.|
: At least one condition must be true.(
parentheses)
around each condition!and
and or
here! They do not behave as you'd want.requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')]
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
12 | Balboa Park | Pothole | 70 | 8 | 78 |
28 | Barrio Logan | Dead Animal | 2 | 0 | 2 |
... | ... | ... | ... | ... | ... |
1396 | Uptown | Pothole | 513 | 87 | 600 |
1415 | Via De La Valle | Dead Animal | 1 | 0 | 1 |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
109 rows × 5 columns
# You can add line breaks within brackets or parentheses.
requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Dead Animal')]
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
12 | Balboa Park | Pothole | 70 | 8 | 78 |
28 | Barrio Logan | Dead Animal | 2 | 0 | 2 |
... | ... | ... | ... | ... | ... |
1396 | Uptown | Pothole | 513 | 87 | 600 |
1415 | Via De La Valle | Dead Animal | 1 | 0 | 1 |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
109 rows × 5 columns
&
and |
operators work element-wise!¶(requests.get('service') == 'Pothole')
0 False 1 False 2 False ... 1418 True 1419 False 1420 False Name: service, Length: 1421, dtype: bool
(requests.get('service') == 'Dead Animal')
0 True 1 False 2 False ... 1418 False 1419 False 1420 False Name: service, Length: 1421, dtype: bool
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')
0 True 1 False 2 False ... 1418 True 1419 False 1420 False Name: service, Length: 1421, dtype: bool
'Pothole'
or 'Dead Animal'
?¶requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Dead Animal')].get('total').sum()
18512
Each of the following questions can be answered by querying the requests
DataFrame.
'Street Flooded'
requests?'Kearny Mesa'
neighborhood, how many different types of services have open requests?'La Jolla'
neighborhood?How many of the questions above require the query to have multiple conditions?
A. 0 B. 1 C. 2 D. 3
Bonus: Try to write code to answer each question.
...
Ellipsis
.take
to select rows by position¶.take([list_of_integer_positions])
. This keeps only the rows whose positions are in the specified list..iloc[]
on a Series.requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.take([1, 3, 5])
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
3 | Balboa Park | Environmental Services Code Compliance | 8 | 1 | 9 |
5 | Balboa Park | Graffiti - Public | 343 | 25 | 368 |
requests.get('service').iloc[[1, 3, 5]]
1 Development Services - Code Enforcement 3 Environmental Services Code Compliance 5 Graffiti - Public Name: service, dtype: object
requests.take(np.arange(5))
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
3 | Balboa Park | Environmental Services Code Compliance | 8 | 1 | 9 |
4 | Balboa Park | Graffiti - Code Enforcement | 2 | 1 | 3 |
Key concept: Grouping by one column.
We can find the total number of Get It Done requests in any one neighborhood using the tools we already have.
requests[requests.get('neighborhood') == 'Black Mountain Ranch'].get('total').sum()
195
requests[requests.get('neighborhood') == 'Uptown'].get('total').sum()
7345
If we wanted to find the total number of requests in every neighborhood, this would be quite inconvenient... there has to be a better way!
.groupby
¶Observe what happens when we use the .groupby
method on requests
with the argument 'neighborhood'
.
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 1173 | 261 | 1434 |
Barrio Logan | 1185 | 201 | 1386 |
Black Mountain Ranch | 151 | 44 | 195 |
... | ... | ... | ... |
University | 1614 | 620 | 2234 |
Uptown | 5654 | 1691 | 7345 |
Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
Note that the 'total'
counts for Black Mountain Ranch and Uptown are the same as we saw on the previous slide. What just happened? 🤯
Consider the DataFrame pets
, shown below.
Species | Color | Weight | Age | |
---|---|---|---|---|
0 | dog | black | 40 | 5.0 |
1 | cat | golden | 15 | 8.0 |
2 | cat | black | 20 | 9.0 |
3 | dog | white | 80 | 2.0 |
4 | dog | golden | 25 | 0.5 |
5 | hamster | golden | 1 | 3.0 |
When we run pets.groupby('Species').mean()
, babypandas
does three things under the hood.
First, it splits the rows of pets
into "groups" according to their values in the 'Species'
column.
Species | Color | Weight | Age | |
---|---|---|---|---|
0 | dog | black | 40 | 5.0 |
3 | dog | white | 80 | 2.0 |
4 | dog | golden | 25 | 0.5 |
Species | Color | Weight | Age | |
---|---|---|---|---|
1 | cat | golden | 15 | 8.0 |
2 | cat | black | 20 | 9.0 |
Species | Color | Weight | Age | |
---|---|---|---|---|
5 | hamster | golden | 1 | 3.0 |
Then, it aggregates the rows with the same value of 'Species'
by taking the mean
of all numerical columns.
Weight | Age | |
---|---|---|
dog | 48.33 | 2.5 |
Weight | Age | |
---|---|---|
cat | 17.5 | 8.5 |
Weight | Age | |
---|---|---|
hamster | 1.0 | 3.0 |
</div>
Finally, it combines these means into a new DataFrame that is indexed by 'Species'
and sorted by 'Species'
in ascending order.
Weight | Age | |
---|---|---|
Species | ||
cat | 17.50 | 8.5 |
dog | 48.33 | 2.5 |
hamster | 1.00 | 3.0 |
pets = bpd.DataFrame().assign(
Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
Weight=[40, 15, 20, 80, 25, 1],
Age=[5, 8, 9, 2, 0.5, 3]
)
pets
Species | Color | Weight | Age | |
---|---|---|---|---|
0 | dog | black | 40 | 5.0 |
1 | cat | golden | 15 | 8.0 |
2 | cat | black | 20 | 9.0 |
3 | dog | white | 80 | 2.0 |
4 | dog | golden | 25 | 0.5 |
5 | hamster | golden | 1 | 3.0 |
pets.groupby('Species').mean()
Weight | Age | |
---|---|---|
Species | ||
cat | 17.50 | 8.5 |
dog | 48.33 | 2.5 |
hamster | 1.00 | 3.0 |
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 1173 | 261 | 1434 |
Barrio Logan | 1185 | 201 | 1386 |
Black Mountain Ranch | 151 | 44 | 195 |
... | ... | ... | ... |
University | 1614 | 620 | 2234 |
Uptown | 5654 | 1691 | 7345 |
Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
Our original goal was to find the neighborhood with the most total requests, so after grouping, we need to sort:
# Note the use of .index – remember, the index isn't a column!
(
requests
.groupby('neighborhood')
.sum()
.sort_values(by='total', ascending=False)
.index[0]
)
'Downtown'
.groupby
in general¶In short, .groupby
aggregates all rows with the same value in a specified column (e.g. 'neighborhood'
) into a single row in the resulting DataFrame, using an aggregation method (e.g. .sum()
) to combine values.
.groupby(column_name)
will gather rows which have the same value in the specified column (column_name
).'neighborhood'
.'neighborhood'
..sum()
method to every 'neighborhood'
.'closed'
, 'open'
, and 'total'
). .count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 1173 | 261 | 1434 |
Barrio Logan | 1185 | 201 | 1386 |
Black Mountain Ranch | 151 | 44 | 195 |
... | ... | ... | ... |
University | 1614 | 620 | 2234 |
Uptown | 5654 | 1691 | 7345 |
Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
The 'service'
column has disappeared. Why?
requests
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
2 | Balboa Park | Encampment | 215 | 20 | 235 |
... | ... | ... | ... | ... | ... |
1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.groupby('neighborhood').sum()
closed | open | total | |
---|---|---|---|
neighborhood | |||
Balboa Park | 1173 | 261 | 1434 |
Barrio Logan | 1185 | 201 | 1386 |
Black Mountain Ranch | 151 | 44 | 195 |
... | ... | ... | ... |
University | 1614 | 620 | 2234 |
Uptown | 5654 | 1691 | 7345 |
Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
.sum()
, in this case – is applied to each column.'service'
column.# Can you guess how the max is determined?
requests.groupby('neighborhood').max()
service | closed | open | total | |
---|---|---|---|---|
neighborhood | ||||
Balboa Park | Weed Cleanup | 343 | 57 | 368 |
Barrio Logan | Weed Cleanup | 286 | 48 | 317 |
Black Mountain Ranch | Tree Maintenance | 41 | 14 | 44 |
... | ... | ... | ... | ... |
University | Weed Cleanup | 712 | 249 | 961 |
Uptown | Weed Cleanup | 1422 | 384 | 1806 |
Via De La Valle | Street Sweeping | 11 | 7 | 18 |
57 rows × 4 columns
requests.groupby('neighborhood').max()
service | closed | open | total | |
---|---|---|---|---|
neighborhood | ||||
Balboa Park | Weed Cleanup | 343 | 57 | 368 |
Barrio Logan | Weed Cleanup | 286 | 48 | 317 |
Black Mountain Ranch | Tree Maintenance | 41 | 14 | 44 |
... | ... | ... | ... | ... |
University | Weed Cleanup | 712 | 249 | 961 |
Uptown | Weed Cleanup | 1422 | 384 | 1806 |
Via De La Valle | Street Sweeping | 11 | 7 | 18 |
57 rows × 4 columns
Why isn't the 'total'
column equal to the sum of the 'closed'
and 'open'
columns, as it originally was?
# Why don't these numbers match those in the grouped DataFrame?
requests[(requests.get('neighborhood') == 'Balboa Park') & (requests.get('service') == 'Weed Cleanup')]
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
27 | Balboa Park | Weed Cleanup | 1 | 0 | 1 |
How do we find the number of different services requested in each neighborhood?
As always when using groupby
, there are two steps:
'neighborhood'
seems like a good choice..count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.# How many different requests are there for the neighborhood 'University'?
requests[requests.get('neighborhood') == 'University']
neighborhood | service | closed | open | total | |
---|---|---|---|---|---|
1354 | University | Dead Animal | 25 | 0 | 25 |
1355 | University | Development Services - Code Enforcement | 7 | 2 | 9 |
1356 | University | Encampment | 55 | 27 | 82 |
... | ... | ... | ... | ... | ... |
1381 | University | Tree Maintenance | 47 | 8 | 55 |
1382 | University | Waste on Private Property | 5 | 1 | 6 |
1383 | University | Weed Cleanup | 1 | 4 | 5 |
30 rows × 5 columns
# How do we find this result for every neighborhood?
The column names of the output of .groupby
don't make sense when using the .count()
aggregation method.
num_diff_services = requests.groupby('neighborhood').count()
num_diff_services
service | closed | open | total | |
---|---|---|---|---|
neighborhood | ||||
Balboa Park | 28 | 28 | 28 | 28 |
Barrio Logan | 28 | 28 | 28 | 28 |
Black Mountain Ranch | 24 | 24 | 24 | 24 |
... | ... | ... | ... | ... |
University | 30 | 30 | 30 | 30 |
Uptown | 31 | 31 | 31 | 31 |
Via De La Valle | 6 | 6 | 6 | 6 |
57 rows × 4 columns
Consider dropping unneeded columns and renaming columns as follows:
.assign
to create a new column containing the same values as the old column(s)..drop(columns=list_of_column_labels)
to drop the old column(s). Alternatively, use .get(list_of_column_labels)
to keep only the columns in the given list. The columns will appear in the order you specify, so this is also useful for reordering columns!num_diff_services = num_diff_services.assign(
count_of_services=num_diff_services.get('open')
).drop(columns=['service', 'closed', 'open', 'total'])
num_diff_services
count_of_services | |
---|---|
neighborhood | |
Balboa Park | 28 |
Barrio Logan | 28 |
Black Mountain Ranch | 24 |
... | ... |
University | 30 |
Uptown | 31 |
Via De La Valle | 6 |
57 rows × 1 columns
We won't cover this section in class. Instead, it's here for you to practice with some harder examples.
The video below walks through the solutions (it's also linked here). You can also see the solutions by clicking the "✅ Click here to see the answer." button below each question.
from IPython.display import YouTubeVideo
YouTubeVideo('xg7rnjWnZ48')
Before watching the video or looking at the solutions, make sure to try these problems on your own – they're great prep for homeworks, projects, and exams! Feel free to ask about them in office hours or on Ed.
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb
Votes | Rating | Year | Decade | |
---|---|---|---|---|
Title | ||||
Akira | 91652 | 8.0 | 1988 | 1980 |
Per un pugno di dollari | 124671 | 8.0 | 1964 | 1960 |
Guardians of the Galaxy | 527349 | 8.0 | 2014 | 2010 |
... | ... | ... | ... | ... |
The Godfather: Part II | 692753 | 9.0 | 1974 | 1970 |
The Shawshank Redemption | 1498733 | 9.2 | 1994 | 1990 |
The Godfather | 1027398 | 9.2 | 1972 | 1970 |
250 rows × 4 columns
imdb.groupby('Decade').count()
Votes | Rating | Year | |
---|---|---|---|
Decade | |||
1920 | 4 | 4 | 4 |
1930 | 7 | 7 | 7 |
1940 | 14 | 14 | 14 |
... | ... | ... | ... |
1990 | 42 | 42 | 42 |
2000 | 50 | 50 | 50 |
2010 | 29 | 29 | 29 |
10 rows × 3 columns
# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');
Let's try to do this two different ways.
imdb[imdb.get('Decade') == 1990].sort_values('Rating', ascending=False).index[0]
'The Shawshank Redemption'
Note: The command to extract the index of a DataFrame is .index
- no parentheses! This is different than the way we extract columns, with .get()
, because the index is not a column.
imdb.reset_index().groupby('Decade').max()
Title | Votes | Rating | Year | |
---|---|---|---|---|
Decade | ||||
1920 | The Kid | 98794 | 8.3 | 1927 |
1930 | The Wizard of Oz | 259235 | 8.5 | 1939 |
1940 | The Treasure of the Sierra Madre | 350551 | 8.6 | 1949 |
... | ... | ... | ... | ... |
1990 | Unforgiven | 1498733 | 9.2 | 1999 |
2000 | Yip Man | 1473049 | 8.9 | 2009 |
2010 | X-Men: Days of Future Past | 1271949 | 8.7 | 2015 |
10 rows × 4 columns
.max()
), aggregation is done to each column individually. max
is not helpful here.
good_movies_per_year = imdb[imdb.get('Rating') > 8.5].groupby('Year').count() good_movies_per_year[good_movies_per_year.get('Votes') > 3].shape[0]As mentioned below, you can also use:
(good_movies_per_year.get('Votes') > 3).sum()
.sum()
on a boolean array¶True
elements because Python treats True
as 1 and False
as 0.
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3 imdb.groupby('Year').mean()[more_than_3_ix].sort_values(by='Rating').index[-1]
Hint: Use .str.len()
on the column or index that contains the names of the movies.
( imdb.assign(title_length=imdb.index.str.len()) .groupby('Year').mean() .sort_values(by='title_length') .index[-1] )The year is 1964 – take a look at the movies from 1964 by querying!
# A Series of Trues and Falses; True when there were at least 3 movies on the list from that year more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3 # The sum of the ratings of movies from years that had at least 3 movies on the list total_rating = imdb.groupby('Year').sum()[more_than_3_ix].get('Rating').sum() # The total number of movies from years that had at least 3 movies on the list count = imdb.groupby('Year').count()[more_than_3_ix].get('Rating').sum() # The correct answer average_rating = total_rating / count # Close, but incorrect: # Doesn't account for the fact that different years have different numbers of movies on the list close_but_wrong = imdb.groupby('Year').mean()[more_than_3_ix].get('Rating').mean()
&
if you require all to be true, or |
if you require at least one to be true.df.groupby(column_name).agg_method()
aggregates all rows with the same value for column_name
into a single row in the resulting DataFrame, using agg_method()
to combine values..count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.A picture is worth a 1000 words – it's time to visualize!