# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
import pandas as pd
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
Recall, the requests DataFrame contains a summary of all service requests so far this year, broken down by neighborhood and service.
requests = bpd.read_csv('data/get-it-done-requests.csv')
requests = requests.assign(total=requests.get('closed') + requests.get('open'))
requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| ... | ... | ... | ... | ... | ... |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
| 1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
| 1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
There are several types of comparisons we can make.
| symbol | meaning |
|---|---|
== |
equal to |
!= |
not equal to |
< |
less than |
<= |
less than or equal to |
> |
greater than |
>= |
greater than or equal to |
5 == 6
False
type(5 == 6)
bool
9 + 10 < 21
True
'zebra' == 'zeb' + 'ra'
True
To select only certain rows of requests:
Trues (keep) and Falses (toss), usually by making a comparison.requests[sequence_goes_here].requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| ... | ... | ... | ... | ... | ... |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
| 1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
| 1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
# A Boolean Series.
requests.get('closed') > 5
0 True
1 False
2 True
...
1418 True
1419 False
1420 False
Name: closed, Length: 1421, dtype: bool
# A query.
requests[requests.get('closed') > 5]
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| 3 | Balboa Park | Environmental Services Code Compliance | 8 | 1 | 9 |
| ... | ... | ... | ... | ... | ... |
| 1413 | Uptown | Waste on Private Property | 40 | 2 | 42 |
| 1414 | Uptown | Weed Cleanup | 21 | 3 | 24 |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
920 rows × 5 columns
'Pothole' requests? 🕳¶Key concept: Querying.
'Pothole' requests.'total' in descending order.'neighborhood' column.# This DataFrame only contains rows where the 'service' is 'Pothole'!
only_potholes = requests[requests.get('service') == 'Pothole']
only_potholes
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 12 | Balboa Park | Pothole | 70 | 8 | 78 |
| 40 | Barrio Logan | Pothole | 111 | 9 | 120 |
| 67 | Black Mountain Ranch | Pothole | 19 | 1 | 20 |
| ... | ... | ... | ... | ... | ... |
| 1366 | University | Pothole | 712 | 249 | 961 |
| 1396 | Uptown | Pothole | 513 | 87 | 600 |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
56 rows × 5 columns
# You can space your code out like this if needed.
(
only_potholes
.sort_values('total', ascending=False)
.get('neighborhood')
.iloc[0]
)
'Clairemont Mesa'
requests[requests.get('service') == 'Car Maintenance']
| neighborhood | service | closed | open | total |
|---|
Which expression below evaluates to the total number of service requests in the 'Downtown' neighborhood?
A. requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()
B. requests.get('total').sum()[requests.get('neighborhood') == 'Downtown']
C. requests['Downtown'].get('total').sum()
D. More than one of the above.
...
Ellipsis
Question: What is the most commonly requested service in the 'University' neighborhood (near UCSD)?
Write one line of code that evaluates to the answer.
...
Ellipsis
'Pothole' or 'Dead Animal'?¶Key concept: Queries with multiple conditions.
& for "and" and | for "or".&: All conditions must be true.|: At least one condition must be true.(parentheses) around each condition!and and or here! They do not behave as you'd want.requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')]
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 12 | Balboa Park | Pothole | 70 | 8 | 78 |
| 28 | Barrio Logan | Dead Animal | 2 | 0 | 2 |
| ... | ... | ... | ... | ... | ... |
| 1396 | Uptown | Pothole | 513 | 87 | 600 |
| 1415 | Via De La Valle | Dead Animal | 1 | 0 | 1 |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
109 rows × 5 columns
# You can add line breaks within brackets or parentheses.
requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Dead Animal')]
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 12 | Balboa Park | Pothole | 70 | 8 | 78 |
| 28 | Barrio Logan | Dead Animal | 2 | 0 | 2 |
| ... | ... | ... | ... | ... | ... |
| 1396 | Uptown | Pothole | 513 | 87 | 600 |
| 1415 | Via De La Valle | Dead Animal | 1 | 0 | 1 |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
109 rows × 5 columns
& and | operators work element-wise!¶(requests.get('service') == 'Pothole')
0 False
1 False
2 False
...
1418 True
1419 False
1420 False
Name: service, Length: 1421, dtype: bool
(requests.get('service') == 'Dead Animal')
0 True
1 False
2 False
...
1418 False
1419 False
1420 False
Name: service, Length: 1421, dtype: bool
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Dead Animal')
0 True
1 False
2 False
...
1418 True
1419 False
1420 False
Name: service, Length: 1421, dtype: bool
'Pothole' or 'Dead Animal'?¶requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Dead Animal')].get('total').sum()
18512
Each of the following questions can be answered by querying the requests DataFrame.
'Street Flooded' requests?'Kearny Mesa' neighborhood, how many different types of services have open requests?'La Jolla' neighborhood?How many of the questions above require the query to have multiple conditions?
A. 0 B. 1 C. 2 D. 3
Bonus: Try to write code to answer each question.
...
Ellipsis
.take to select rows by position¶.take([list_of_integer_positions]). This keeps only the rows whose positions are in the specified list..iloc[] on a Series.requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| ... | ... | ... | ... | ... | ... |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
| 1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
| 1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.take([1, 3, 5])
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 3 | Balboa Park | Environmental Services Code Compliance | 8 | 1 | 9 |
| 5 | Balboa Park | Graffiti - Public | 343 | 25 | 368 |
requests.get('service').iloc[[1, 3, 5]]
1 Development Services - Code Enforcement 3 Environmental Services Code Compliance 5 Graffiti - Public Name: service, dtype: object
requests.take(np.arange(5))
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| 3 | Balboa Park | Environmental Services Code Compliance | 8 | 1 | 9 |
| 4 | Balboa Park | Graffiti - Code Enforcement | 2 | 1 | 3 |
Key concept: Grouping by one column.
We can find the total number of Get It Done requests in any one neighborhood using the tools we already have.
requests[requests.get('neighborhood') == 'Black Mountain Ranch'].get('total').sum()
195
requests[requests.get('neighborhood') == 'Uptown'].get('total').sum()
7345
If we wanted to find the total number of requests in every neighborhood, this would be quite inconvenient... there has to be a better way!
.groupby¶Observe what happens when we use the .groupby method on requests with the argument 'neighborhood'.
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 1173 | 261 | 1434 |
| Barrio Logan | 1185 | 201 | 1386 |
| Black Mountain Ranch | 151 | 44 | 195 |
| ... | ... | ... | ... |
| University | 1614 | 620 | 2234 |
| Uptown | 5654 | 1691 | 7345 |
| Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
Note that the 'total' counts for Black Mountain Ranch and Uptown are the same as we saw on the previous slide. What just happened? 🤯
Consider the DataFrame pets, shown below.
| Species | Color | Weight | Age | |
|---|---|---|---|---|
| 0 | dog | black | 40 | 5.0 |
| 1 | cat | golden | 15 | 8.0 |
| 2 | cat | black | 20 | 9.0 |
| 3 | dog | white | 80 | 2.0 |
| 4 | dog | golden | 25 | 0.5 |
| 5 | hamster | golden | 1 | 3.0 |
When we run pets.groupby('Species').mean(), babypandas does three things under the hood.
First, it splits the rows of pets into "groups" according to their values in the 'Species' column.
| Species | Color | Weight | Age | |
|---|---|---|---|---|
| 0 | dog | black | 40 | 5.0 |
| 3 | dog | white | 80 | 2.0 |
| 4 | dog | golden | 25 | 0.5 |
| Species | Color | Weight | Age | |
|---|---|---|---|---|
| 1 | cat | golden | 15 | 8.0 |
| 2 | cat | black | 20 | 9.0 |
| Species | Color | Weight | Age | |
|---|---|---|---|---|
| 5 | hamster | golden | 1 | 3.0 |
Then, it aggregates the rows with the same value of 'Species' by taking the mean of all numerical columns.
| Weight | Age | |
|---|---|---|
| dog | 48.33 | 2.5 |
| Weight | Age | |
|---|---|---|
| cat | 17.5 | 8.5 |
| Weight | Age | |
|---|---|---|
| hamster | 1.0 | 3.0 |
</div>
Finally, it combines these means into a new DataFrame that is indexed by 'Species' and sorted by 'Species' in ascending order.
| Weight | Age | |
|---|---|---|
| Species | ||
| cat | 17.50 | 8.5 |
| dog | 48.33 | 2.5 |
| hamster | 1.00 | 3.0 |
pets = bpd.DataFrame().assign(
Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
Weight=[40, 15, 20, 80, 25, 1],
Age=[5, 8, 9, 2, 0.5, 3]
)
pets
| Species | Color | Weight | Age | |
|---|---|---|---|---|
| 0 | dog | black | 40 | 5.0 |
| 1 | cat | golden | 15 | 8.0 |
| 2 | cat | black | 20 | 9.0 |
| 3 | dog | white | 80 | 2.0 |
| 4 | dog | golden | 25 | 0.5 |
| 5 | hamster | golden | 1 | 3.0 |
pets.groupby('Species').mean()
| Weight | Age | |
|---|---|---|
| Species | ||
| cat | 17.50 | 8.5 |
| dog | 48.33 | 2.5 |
| hamster | 1.00 | 3.0 |
requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| ... | ... | ... | ... | ... | ... |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
| 1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
| 1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 1173 | 261 | 1434 |
| Barrio Logan | 1185 | 201 | 1386 |
| Black Mountain Ranch | 151 | 44 | 195 |
| ... | ... | ... | ... |
| University | 1614 | 620 | 2234 |
| Uptown | 5654 | 1691 | 7345 |
| Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
Our original goal was to find the neighborhood with the most total requests, so after grouping, we need to sort:
# Note the use of .index – remember, the index isn't a column!
(
requests
.groupby('neighborhood')
.sum()
.sort_values(by='total', ascending=False)
.index[0]
)
'Downtown'
.groupby in general¶In short, .groupby aggregates all rows with the same value in a specified column (e.g. 'neighborhood') into a single row in the resulting DataFrame, using an aggregation method (e.g. .sum()) to combine values.
.groupby(column_name) will gather rows which have the same value in the specified column (column_name).'neighborhood'.'neighborhood'..sum() method to every 'neighborhood'.'closed', 'open', and 'total'). .count(), .sum(), .mean(), .median(), .max(), and .min().requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| ... | ... | ... | ... | ... | ... |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
| 1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
| 1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 1173 | 261 | 1434 |
| Barrio Logan | 1185 | 201 | 1386 |
| Black Mountain Ranch | 151 | 44 | 195 |
| ... | ... | ... | ... |
| University | 1614 | 620 | 2234 |
| Uptown | 5654 | 1691 | 7345 |
| Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
The 'service' column has disappeared. Why?
requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 11 | 0 | 11 |
| 1 | Balboa Park | Development Services - Code Enforcement | 2 | 0 | 2 |
| 2 | Balboa Park | Encampment | 215 | 20 | 235 |
| ... | ... | ... | ... | ... | ... |
| 1418 | Via De La Valle | Pothole | 11 | 7 | 18 |
| 1419 | Via De La Valle | Sidewalk Repair Issue | 0 | 1 | 1 |
| 1420 | Via De La Valle | Street Sweeping | 1 | 0 | 1 |
1421 rows × 5 columns
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 1173 | 261 | 1434 |
| Barrio Logan | 1185 | 201 | 1386 |
| Black Mountain Ranch | 151 | 44 | 195 |
| ... | ... | ... | ... |
| University | 1614 | 620 | 2234 |
| Uptown | 5654 | 1691 | 7345 |
| Via De La Valle | 13 | 12 | 25 |
57 rows × 3 columns
.sum(), in this case – is applied to each column.'service' column.# Can you guess how the max is determined?
requests.groupby('neighborhood').max()
| service | closed | open | total | |
|---|---|---|---|---|
| neighborhood | ||||
| Balboa Park | Weed Cleanup | 343 | 57 | 368 |
| Barrio Logan | Weed Cleanup | 286 | 48 | 317 |
| Black Mountain Ranch | Tree Maintenance | 41 | 14 | 44 |
| ... | ... | ... | ... | ... |
| University | Weed Cleanup | 712 | 249 | 961 |
| Uptown | Weed Cleanup | 1422 | 384 | 1806 |
| Via De La Valle | Street Sweeping | 11 | 7 | 18 |
57 rows × 4 columns
requests.groupby('neighborhood').max()
| service | closed | open | total | |
|---|---|---|---|---|
| neighborhood | ||||
| Balboa Park | Weed Cleanup | 343 | 57 | 368 |
| Barrio Logan | Weed Cleanup | 286 | 48 | 317 |
| Black Mountain Ranch | Tree Maintenance | 41 | 14 | 44 |
| ... | ... | ... | ... | ... |
| University | Weed Cleanup | 712 | 249 | 961 |
| Uptown | Weed Cleanup | 1422 | 384 | 1806 |
| Via De La Valle | Street Sweeping | 11 | 7 | 18 |
57 rows × 4 columns
Why isn't the 'total' column equal to the sum of the 'closed' and 'open' columns, as it originally was?
# Why don't these numbers match those in the grouped DataFrame?
requests[(requests.get('neighborhood') == 'Balboa Park') & (requests.get('service') == 'Weed Cleanup')]
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 27 | Balboa Park | Weed Cleanup | 1 | 0 | 1 |
How do we find the number of different services requested in each neighborhood?
As always when using groupby, there are two steps:
'neighborhood' seems like a good choice..count(), .sum(), .mean(), .median(), .max(), and .min().# How many different requests are there for the neighborhood 'University'?
requests[requests.get('neighborhood') == 'University']
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 1354 | University | Dead Animal | 25 | 0 | 25 |
| 1355 | University | Development Services - Code Enforcement | 7 | 2 | 9 |
| 1356 | University | Encampment | 55 | 27 | 82 |
| ... | ... | ... | ... | ... | ... |
| 1381 | University | Tree Maintenance | 47 | 8 | 55 |
| 1382 | University | Waste on Private Property | 5 | 1 | 6 |
| 1383 | University | Weed Cleanup | 1 | 4 | 5 |
30 rows × 5 columns
# How do we find this result for every neighborhood?
The column names of the output of .groupby don't make sense when using the .count() aggregation method.
num_diff_services = requests.groupby('neighborhood').count()
num_diff_services
| service | closed | open | total | |
|---|---|---|---|---|
| neighborhood | ||||
| Balboa Park | 28 | 28 | 28 | 28 |
| Barrio Logan | 28 | 28 | 28 | 28 |
| Black Mountain Ranch | 24 | 24 | 24 | 24 |
| ... | ... | ... | ... | ... |
| University | 30 | 30 | 30 | 30 |
| Uptown | 31 | 31 | 31 | 31 |
| Via De La Valle | 6 | 6 | 6 | 6 |
57 rows × 4 columns
Consider dropping unneeded columns and renaming columns as follows:
.assign to create a new column containing the same values as the old column(s)..drop(columns=list_of_column_labels) to drop the old column(s). Alternatively, use .get(list_of_column_labels) to keep only the columns in the given list. The columns will appear in the order you specify, so this is also useful for reordering columns!num_diff_services = num_diff_services.assign(
count_of_services=num_diff_services.get('open')
).drop(columns=['service', 'closed', 'open', 'total'])
num_diff_services
| count_of_services | |
|---|---|
| neighborhood | |
| Balboa Park | 28 |
| Barrio Logan | 28 |
| Black Mountain Ranch | 24 |
| ... | ... |
| University | 30 |
| Uptown | 31 |
| Via De La Valle | 6 |
57 rows × 1 columns
We won't cover this section in class. Instead, it's here for you to practice with some harder examples.
The video below walks through the solutions (it's also linked here). You can also see the solutions by clicking the "✅ Click here to see the answer." button below each question.
from IPython.display import YouTubeVideo
YouTubeVideo('xg7rnjWnZ48')
Before watching the video or looking at the solutions, make sure to try these problems on your own – they're great prep for homeworks, projects, and exams! Feel free to ask about them in office hours or on Ed.
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb
| Votes | Rating | Year | Decade | |
|---|---|---|---|---|
| Title | ||||
| Akira | 91652 | 8.0 | 1988 | 1980 |
| Per un pugno di dollari | 124671 | 8.0 | 1964 | 1960 |
| Guardians of the Galaxy | 527349 | 8.0 | 2014 | 2010 |
| ... | ... | ... | ... | ... |
| The Godfather: Part II | 692753 | 9.0 | 1974 | 1970 |
| The Shawshank Redemption | 1498733 | 9.2 | 1994 | 1990 |
| The Godfather | 1027398 | 9.2 | 1972 | 1970 |
250 rows × 4 columns
imdb.groupby('Decade').count()
| Votes | Rating | Year | |
|---|---|---|---|
| Decade | |||
| 1920 | 4 | 4 | 4 |
| 1930 | 7 | 7 | 7 |
| 1940 | 14 | 14 | 14 |
| ... | ... | ... | ... |
| 1990 | 42 | 42 | 42 |
| 2000 | 50 | 50 | 50 |
| 2010 | 29 | 29 | 29 |
10 rows × 3 columns
# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');
Let's try to do this two different ways.
imdb[imdb.get('Decade') == 1990].sort_values('Rating', ascending=False).index[0]
'The Shawshank Redemption'
Note: The command to extract the index of a DataFrame is .index - no parentheses! This is different than the way we extract columns, with .get(), because the index is not a column.
imdb.reset_index().groupby('Decade').max()
| Title | Votes | Rating | Year | |
|---|---|---|---|---|
| Decade | ||||
| 1920 | The Kid | 98794 | 8.3 | 1927 |
| 1930 | The Wizard of Oz | 259235 | 8.5 | 1939 |
| 1940 | The Treasure of the Sierra Madre | 350551 | 8.6 | 1949 |
| ... | ... | ... | ... | ... |
| 1990 | Unforgiven | 1498733 | 9.2 | 1999 |
| 2000 | Yip Man | 1473049 | 8.9 | 2009 |
| 2010 | X-Men: Days of Future Past | 1271949 | 8.7 | 2015 |
10 rows × 4 columns
.max()), aggregation is done to each column individually. max is not helpful here.
good_movies_per_year = imdb[imdb.get('Rating') > 8.5].groupby('Year').count()
good_movies_per_year[good_movies_per_year.get('Votes') > 3].shape[0]
As mentioned below, you can also use:
(good_movies_per_year.get('Votes') > 3).sum()
.sum() on a boolean array¶True elements because Python treats True as 1 and False as 0.
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3
imdb.groupby('Year').mean()[more_than_3_ix].sort_values(by='Rating').index[-1]
Hint: Use .str.len() on the column or index that contains the names of the movies.
(
imdb.assign(title_length=imdb.index.str.len())
.groupby('Year').mean()
.sort_values(by='title_length')
.index[-1]
)
The year is 1964 – take a look at the movies from 1964 by querying!
# A Series of Trues and Falses; True when there were at least 3 movies on the list from that year
more_than_3_ix = imdb.groupby('Year').count().get('Votes') > 3
# The sum of the ratings of movies from years that had at least 3 movies on the list
total_rating = imdb.groupby('Year').sum()[more_than_3_ix].get('Rating').sum()
# The total number of movies from years that had at least 3 movies on the list
count = imdb.groupby('Year').count()[more_than_3_ix].get('Rating').sum()
# The correct answer
average_rating = total_rating / count
# Close, but incorrect:
# Doesn't account for the fact that different years have different numbers of movies on the list
close_but_wrong = imdb.groupby('Year').mean()[more_than_3_ix].get('Rating').mean()
& if you require all to be true, or | if you require at least one to be true.df.groupby(column_name).agg_method() aggregates all rows with the same value for column_name into a single row in the resulting DataFrame, using agg_method() to combine values..count(), .sum(), .mean(), .median(), .max(), and .min().A picture is worth a 1000 words – it's time to visualize!