# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
%reload_ext pandas_tutor
%set_pandas_tutor_options {'projectorMode': True}
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")
plt.style.use('ggplot')
The requests DataFrame contains a summary of all service requests so far in 2022, broken down by neighborhood and service.
requests = bpd.read_csv('data/get-it-done-requests.csv')
requests = requests.assign(total=requests.get('closed') + requests.get('open'))
requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 4 | 0 | 4 |
| 1 | Balboa Park | Encampment | 22 | 16 | 38 |
| 2 | Balboa Park | Environmental Services Code Compliance | 0 | 1 | 1 |
| 3 | Balboa Park | Graffiti - Code Enforcement | 0 | 1 | 1 |
| 4 | Balboa Park | Graffiti - Public | 62 | 37 | 99 |
| ... | ... | ... | ... | ... | ... |
| 1057 | Uptown | Tree Maintenance | 19 | 29 | 48 |
| 1058 | Uptown | Vegetation Encroachment | 1 | 2 | 3 |
| 1059 | Uptown | Waste on Private Property | 3 | 1 | 4 |
| 1060 | Uptown | Weed Cleanup | 1 | 0 | 1 |
| 1061 | Via De La Valle | Pothole | 0 | 2 | 2 |
1062 rows × 5 columns
To select only certain rows of requests:
Trues (keep) and Falses (toss), usually by making a comparison.requests[sequence_goes_here].There are several types of comparisons we can make.
| symbol | meaning |
|---|---|
== |
equal to |
!= |
not equal to |
< |
less than |
<= |
less than or equal to |
> |
greater than |
>= |
greater than or equal to |
'Tree Maintenance' requests? 🌳¶Key concept: Querying
'Tree Maintenance' requests.'total' in descending order.'neighborhood' column.tree_maintenance_only = requests[requests.get('service') == 'Tree Maintenance']
tree_maintenance_only
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 21 | Balboa Park | Tree Maintenance | 2 | 7 | 9 |
| 41 | Barrio Logan | Tree Maintenance | 1 | 2 | 3 |
| 57 | Black Mountain Ranch | Tree Maintenance | 2 | 0 | 2 |
| 95 | Carmel Valley | Tree Maintenance | 6 | 2 | 8 |
| 122 | Clairemont Mesa | Tree Maintenance | 14 | 15 | 29 |
| ... | ... | ... | ... | ... | ... |
| 974 | Torrey Highlands | Tree Maintenance | 0 | 1 | 1 |
| 985 | Torrey Hills | Tree Maintenance | 1 | 1 | 2 |
| 1006 | Torrey Pines | Tree Maintenance | 1 | 2 | 3 |
| 1029 | University | Tree Maintenance | 5 | 9 | 14 |
| 1057 | Uptown | Tree Maintenance | 19 | 29 | 48 |
43 rows × 5 columns
tree_maintenance_sorted = tree_maintenance_only.sort_values(by='total', ascending=False)
tree_maintenance_sorted
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 935 | Southeastern San Diego | Tree Maintenance | 12 | 44 | 56 |
| 596 | North Park | Tree Maintenance | 24 | 30 | 54 |
| 1057 | Uptown | Tree Maintenance | 19 | 29 | 48 |
| 179 | Downtown | Tree Maintenance | 15 | 28 | 43 |
| 712 | Pacific Beach | Tree Maintenance | 22 | 15 | 37 |
| ... | ... | ... | ... | ... | ... |
| 497 | Mission Bay Park | Tree Maintenance | 0 | 1 | 1 |
| 963 | Tijuana River Valley | Tree Maintenance | 0 | 1 | 1 |
| 974 | Torrey Highlands | Tree Maintenance | 0 | 1 | 1 |
| 638 | Old Town San Diego | Tree Maintenance | 1 | 0 | 1 |
| 518 | Mission Beach | Tree Maintenance | 1 | 0 | 1 |
43 rows × 5 columns
tree_maintenance_sorted.get('neighborhood').iloc[0]
'Southeastern San Diego'
requests[requests.get('service') == 'Car Maintenance']
| neighborhood | service | closed | open | total |
|---|
Which expression below evaluates to the total number of service requests in the 'Downtown' neighborhood?
A. requests[requests.get('neighborhood') == 'Downtown'].get('total').sum()
B. requests.get('total').sum()[requests.get('neighborhood') == 'Downtown']
C. requests['Downtown'].get('total').sum()
D. More than one of the above.
...
Ellipsis
Question: What is the most commonly requested service in the 'University' neighborhood (near UCSD)?
Write one line of code that evaluates to the answer.
...
Ellipsis
'Pothole' or 'Pavement Maintenance'?¶Key concept: Queries with multiple conditions.
requests[(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')]
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 9 | Balboa Park | Pavement Maintenance | 0 | 1 | 1 |
| 10 | Balboa Park | Pothole | 12 | 9 | 21 |
| 29 | Barrio Logan | Pavement Maintenance | 2 | 3 | 5 |
| 30 | Barrio Logan | Pothole | 1 | 8 | 9 |
| 65 | Carmel Mountain Ranch | Pavement Maintenance | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... |
| 1015 | University | Pavement Maintenance | 0 | 1 | 1 |
| 1016 | University | Pothole | 15 | 101 | 116 |
| 1043 | Uptown | Pavement Maintenance | 1 | 9 | 10 |
| 1044 | Uptown | Pothole | 15 | 93 | 108 |
| 1061 | Via De La Valle | Pothole | 0 | 2 | 2 |
85 rows × 5 columns
# You can add line breaks within brackets or parentheses
requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Pavement Maintenance')]
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 9 | Balboa Park | Pavement Maintenance | 0 | 1 | 1 |
| 10 | Balboa Park | Pothole | 12 | 9 | 21 |
| 29 | Barrio Logan | Pavement Maintenance | 2 | 3 | 5 |
| 30 | Barrio Logan | Pothole | 1 | 8 | 9 |
| 65 | Carmel Mountain Ranch | Pavement Maintenance | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... |
| 1015 | University | Pavement Maintenance | 0 | 1 | 1 |
| 1016 | University | Pothole | 15 | 101 | 116 |
| 1043 | Uptown | Pavement Maintenance | 1 | 9 | 10 |
| 1044 | Uptown | Pothole | 15 | 93 | 108 |
| 1061 | Via De La Valle | Pothole | 0 | 2 | 2 |
85 rows × 5 columns
& and | operators work element-wise¶(requests.get('service') == 'Pothole')
0 False
1 False
2 False
3 False
4 False
...
1057 False
1058 False
1059 False
1060 False
1061 True
Name: service, Length: 1062, dtype: bool
(requests.get('service') == 'Pavement Maintenance')
0 False
1 False
2 False
3 False
4 False
...
1057 False
1058 False
1059 False
1060 False
1061 False
Name: service, Length: 1062, dtype: bool
(requests.get('service') == 'Pothole') | (requests.get('service') == 'Pavement Maintenance')
0 False
1 False
2 False
3 False
4 False
...
1057 False
1058 False
1059 False
1060 False
1061 True
Name: service, Length: 1062, dtype: bool
'Pothole' or 'Pavement Maintenance'?¶requests[(requests.get('service') == 'Pothole') |
(requests.get('service') == 'Pavement Maintenance')].get('total').sum()
2805
Each of the following questions can be answered by querying the requests DataFrame.
'Street Flooded' requests?'Kearny Mesa' neighborhood, how many different types of services have open requests?'La Jolla' neighborhood?How many of the questions above require the query to have multiple conditions?
A. 0 B. 1 C. 2 D. 3
Bonus: Try to write the code to answer each question.
...
Ellipsis
.take¶.take([list_of_integer_positions]). This keeps only the rows whose positions are in the specified list..iloc[] on a Series.requests.take([1, 3, 5])
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 1 | Balboa Park | Encampment | 22 | 16 | 38 |
| 3 | Balboa Park | Graffiti - Code Enforcement | 0 | 1 | 1 |
| 5 | Balboa Park | Illegal Dumping | 3 | 4 | 7 |
requests.take(np.arange(5))
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 4 | 0 | 4 |
| 1 | Balboa Park | Encampment | 22 | 16 | 38 |
| 2 | Balboa Park | Environmental Services Code Compliance | 0 | 1 | 1 |
| 3 | Balboa Park | Graffiti - Code Enforcement | 0 | 1 | 1 |
| 4 | Balboa Park | Graffiti - Public | 62 | 37 | 99 |
Key concept: Grouping by one column.
requests[requests.get('neighborhood') == 'Carmel Valley'].get('total').sum().requests[requests.get('neighborhood') == 'Carmel Valley'].get('total').sum()
144
requests[requests.get('neighborhood') == 'Uptown'].get('total').sum()
1128
It seems like there has to be a better way. And there is!
Observe what happens when we use the .groupby method on requests with the argument 'neighborhood'.
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 163 | 137 | 300 |
| Barrio Logan | 116 | 91 | 207 |
| Black Mountain Ranch | 27 | 24 | 51 |
| Carmel Mountain Ranch | 14 | 83 | 97 |
| Carmel Valley | 83 | 61 | 144 |
| ... | ... | ... | ... |
| Torrey Hills | 22 | 21 | 43 |
| Torrey Pines | 39 | 64 | 103 |
| University | 114 | 214 | 328 |
| Uptown | 592 | 536 | 1128 |
| Via De La Valle | 0 | 2 | 2 |
57 rows × 3 columns
Note that the 'total' counts for Carmel Valley and Torrey Hills are the same as we saw on the previous slide. What just happened? 🤯
Consider the DataFrame pets shown below.
pets = bpd.DataFrame().assign(
Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
Weight=[40, 15, 20, 80, 25, 1],
Age=[5, 8, 9, 2, 0.5, 3]
)
pets
| Species | Color | Weight | Age | |
|---|---|---|---|---|
| 0 | dog | black | 40 | 5.0 |
| 1 | cat | golden | 15 | 8.0 |
| 2 | cat | black | 20 | 9.0 |
| 3 | dog | white | 80 | 2.0 |
| 4 | dog | golden | 25 | 0.5 |
| 5 | hamster | golden | 1 | 3.0 |
pets.groupby('Species').mean()¶pets into "groups" according to their values in the 'Species' column.'Species' by taking the mean of all numerical columns.'Species' and sorted by 'Species' in ascending order.Note that the result contains just one row for cats, one row for dogs, and one row for hamsters!
%%pt
pets.groupby('Species').mean()
pandas (and babypandas) code.%%pt to the top of a code cell to explain the last line of babypandas code.# Without Pandas Tutor
pets.groupby('Species').mean()
| Weight | Age | |
|---|---|---|
| Species | ||
| cat | 17.500000 | 8.5 |
| dog | 48.333333 | 2.5 |
| hamster | 1.000000 | 3.0 |
%%pt
# With Pandas Tutor
pets.groupby('Species').mean()
requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 4 | 0 | 4 |
| 1 | Balboa Park | Encampment | 22 | 16 | 38 |
| 2 | Balboa Park | Environmental Services Code Compliance | 0 | 1 | 1 |
| 3 | Balboa Park | Graffiti - Code Enforcement | 0 | 1 | 1 |
| 4 | Balboa Park | Graffiti - Public | 62 | 37 | 99 |
| ... | ... | ... | ... | ... | ... |
| 1057 | Uptown | Tree Maintenance | 19 | 29 | 48 |
| 1058 | Uptown | Vegetation Encroachment | 1 | 2 | 3 |
| 1059 | Uptown | Waste on Private Property | 3 | 1 | 4 |
| 1060 | Uptown | Weed Cleanup | 1 | 0 | 1 |
| 1061 | Via De La Valle | Pothole | 0 | 2 | 2 |
1062 rows × 5 columns
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 163 | 137 | 300 |
| Barrio Logan | 116 | 91 | 207 |
| Black Mountain Ranch | 27 | 24 | 51 |
| Carmel Mountain Ranch | 14 | 83 | 97 |
| Carmel Valley | 83 | 61 | 144 |
| ... | ... | ... | ... |
| Torrey Hills | 22 | 21 | 43 |
| Torrey Pines | 39 | 64 | 103 |
| University | 114 | 214 | 328 |
| Uptown | 592 | 536 | 1128 |
| Via De La Valle | 0 | 2 | 2 |
57 rows × 3 columns
.groupby in general¶In short, .groupby aggregates all rows with the same value in a specified column (e.g. 'neighborhood') into a single row in the resulting DataFrame, using an aggregation method (e.g. .sum()) to combine values.
.groupby(column_name) will gather rows which have the same value in the specified column (column_name).'neighborhood'.'neighborhood'..sum() method to every 'neighborhood'.'closed', 'open', and 'total'). .count(), .sum(), .mean(), .median(), .max(), and .min().requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 4 | 0 | 4 |
| 1 | Balboa Park | Encampment | 22 | 16 | 38 |
| 2 | Balboa Park | Environmental Services Code Compliance | 0 | 1 | 1 |
| 3 | Balboa Park | Graffiti - Code Enforcement | 0 | 1 | 1 |
| 4 | Balboa Park | Graffiti - Public | 62 | 37 | 99 |
| ... | ... | ... | ... | ... | ... |
| 1057 | Uptown | Tree Maintenance | 19 | 29 | 48 |
| 1058 | Uptown | Vegetation Encroachment | 1 | 2 | 3 |
| 1059 | Uptown | Waste on Private Property | 3 | 1 | 4 |
| 1060 | Uptown | Weed Cleanup | 1 | 0 | 1 |
| 1061 | Via De La Valle | Pothole | 0 | 2 | 2 |
1062 rows × 5 columns
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 163 | 137 | 300 |
| Barrio Logan | 116 | 91 | 207 |
| Black Mountain Ranch | 27 | 24 | 51 |
| Carmel Mountain Ranch | 14 | 83 | 97 |
| Carmel Valley | 83 | 61 | 144 |
| ... | ... | ... | ... |
| Torrey Hills | 22 | 21 | 43 |
| Torrey Pines | 39 | 64 | 103 |
| University | 114 | 214 | 328 |
| Uptown | 592 | 536 | 1128 |
| Via De La Valle | 0 | 2 | 2 |
57 rows × 3 columns
The 'service' column has disappeared. Why?
requests
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 0 | Balboa Park | Dead Animal | 4 | 0 | 4 |
| 1 | Balboa Park | Encampment | 22 | 16 | 38 |
| 2 | Balboa Park | Environmental Services Code Compliance | 0 | 1 | 1 |
| 3 | Balboa Park | Graffiti - Code Enforcement | 0 | 1 | 1 |
| 4 | Balboa Park | Graffiti - Public | 62 | 37 | 99 |
| ... | ... | ... | ... | ... | ... |
| 1057 | Uptown | Tree Maintenance | 19 | 29 | 48 |
| 1058 | Uptown | Vegetation Encroachment | 1 | 2 | 3 |
| 1059 | Uptown | Waste on Private Property | 3 | 1 | 4 |
| 1060 | Uptown | Weed Cleanup | 1 | 0 | 1 |
| 1061 | Via De La Valle | Pothole | 0 | 2 | 2 |
1062 rows × 5 columns
requests.groupby('neighborhood').sum()
| closed | open | total | |
|---|---|---|---|
| neighborhood | |||
| Balboa Park | 163 | 137 | 300 |
| Barrio Logan | 116 | 91 | 207 |
| Black Mountain Ranch | 27 | 24 | 51 |
| Carmel Mountain Ranch | 14 | 83 | 97 |
| Carmel Valley | 83 | 61 | 144 |
| ... | ... | ... | ... |
| Torrey Hills | 22 | 21 | 43 |
| Torrey Pines | 39 | 64 | 103 |
| University | 114 | 214 | 328 |
| Uptown | 592 | 536 | 1128 |
| Via De La Valle | 0 | 2 | 2 |
57 rows × 3 columns
.sum(), in this case – is applied to each column.'service' column.# Can you guess how the max is determined?
requests.groupby('neighborhood').max()
| service | closed | open | total | |
|---|---|---|---|---|
| neighborhood | ||||
| Balboa Park | Tree Maintenance | 62 | 37 | 99 |
| Barrio Logan | Waste on Private Property | 44 | 26 | 47 |
| Black Mountain Ranch | Tree Maintenance | 14 | 6 | 20 |
| Carmel Mountain Ranch | Trash/Recycling Collection | 3 | 59 | 62 |
| Carmel Valley | Tree Maintenance | 55 | 17 | 72 |
| ... | ... | ... | ... | ... |
| Torrey Hills | Tree Maintenance | 16 | 5 | 18 |
| Torrey Pines | Tree Maintenance | 15 | 29 | 44 |
| University | Weed Cleanup | 16 | 101 | 116 |
| Uptown | Weed Cleanup | 227 | 94 | 321 |
| Via De La Valle | Pothole | 0 | 2 | 2 |
57 rows × 4 columns
requests.groupby('neighborhood').max()
| service | closed | open | total | |
|---|---|---|---|---|
| neighborhood | ||||
| Balboa Park | Tree Maintenance | 62 | 37 | 99 |
| Barrio Logan | Waste on Private Property | 44 | 26 | 47 |
| Black Mountain Ranch | Tree Maintenance | 14 | 6 | 20 |
| Carmel Mountain Ranch | Trash/Recycling Collection | 3 | 59 | 62 |
| Carmel Valley | Tree Maintenance | 55 | 17 | 72 |
| ... | ... | ... | ... | ... |
| Torrey Hills | Tree Maintenance | 16 | 5 | 18 |
| Torrey Pines | Tree Maintenance | 15 | 29 | 44 |
| University | Weed Cleanup | 16 | 101 | 116 |
| Uptown | Weed Cleanup | 227 | 94 | 321 |
| Via De La Valle | Pothole | 0 | 2 | 2 |
57 rows × 4 columns
Why isn't the 'total' column equal to the sum of the 'closed' and 'open' columns, as it originally was?
.groupby¶How do we find the number of different services requested in each neighborhood?
Two choices:
.count(), .sum(), .mean(), .median(), .max(), and .min().# Answering the question for one particular neighborhood, La Jolla
requests[requests.get('neighborhood') == 'La Jolla']
| neighborhood | service | closed | open | total | |
|---|---|---|---|---|---|
| 260 | La Jolla | Dead Animal | 2 | 0 | 2 |
| 261 | La Jolla | Encampment | 13 | 8 | 21 |
| 262 | La Jolla | Environmental Services Code Compliance | 1 | 3 | 4 |
| 263 | La Jolla | Graffiti - Public | 3 | 2 | 5 |
| 264 | La Jolla | Illegal Dumping | 8 | 7 | 15 |
| ... | ... | ... | ... | ... | ... |
| 279 | La Jolla | Traffic Sign Maintenance | 2 | 24 | 26 |
| 280 | La Jolla | Traffic Signal Issue | 11 | 1 | 12 |
| 281 | La Jolla | Traffic Signal Timing | 3 | 1 | 4 |
| 282 | La Jolla | Trash/Recycling Collection | 1 | 1 | 2 |
| 283 | La Jolla | Tree Maintenance | 11 | 14 | 25 |
24 rows × 5 columns
...
Ellipsis
.groupby don't make sense when using the .count() aggregation method..assign to create a new column containing the same values as the old column(s)..drop(columns=list_of_column_labels) to drop the old column(s). Alternatively, use .get(list_of_column_labels) to keep only the columns in the given list. The columns will appear in the order you specify, so this is also useful for reordering columns!num_diff_services = requests.groupby('neighborhood').count()
num_diff_services
| service | closed | open | total | |
|---|---|---|---|---|
| neighborhood | ||||
| Balboa Park | 22 | 22 | 22 | 22 |
| Barrio Logan | 21 | 21 | 21 | 21 |
| Black Mountain Ranch | 15 | 15 | 15 | 15 |
| Carmel Mountain Ranch | 18 | 18 | 18 | 18 |
| Carmel Valley | 20 | 20 | 20 | 20 |
| ... | ... | ... | ... | ... |
| Torrey Hills | 11 | 11 | 11 | 11 |
| Torrey Pines | 21 | 21 | 21 | 21 |
| University | 25 | 25 | 25 | 25 |
| Uptown | 29 | 29 | 29 | 29 |
| Via De La Valle | 1 | 1 | 1 | 1 |
57 rows × 4 columns
num_diff_services = num_diff_services.assign(
count_of_services=num_diff_services.get('open')
).drop(columns=['service', 'closed', 'open', 'total'])
num_diff_services
| count_of_services | |
|---|---|
| neighborhood | |
| Balboa Park | 22 |
| Barrio Logan | 21 |
| Black Mountain Ranch | 15 |
| Carmel Mountain Ranch | 18 |
| Carmel Valley | 20 |
| ... | ... |
| Torrey Hills | 11 |
| Torrey Pines | 21 |
| University | 25 |
| Uptown | 29 |
| Via De La Valle | 1 |
57 rows × 1 columns
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb
| Votes | Rating | Year | Decade | |
|---|---|---|---|---|
| Title | ||||
| Akira | 91652 | 8.0 | 1988 | 1980 |
| Per un pugno di dollari | 124671 | 8.0 | 1964 | 1960 |
| Guardians of the Galaxy | 527349 | 8.0 | 2014 | 2010 |
| The Man Who Shot Liberty Valance | 49135 | 8.0 | 1962 | 1960 |
| Underground | 39447 | 8.0 | 1995 | 1990 |
| ... | ... | ... | ... | ... |
| Schindler's List | 761224 | 8.9 | 1993 | 1990 |
| 12 Angry Men | 384187 | 8.9 | 1957 | 1950 |
| The Godfather: Part II | 692753 | 9.0 | 1974 | 1970 |
| The Shawshank Redemption | 1498733 | 9.2 | 1994 | 1990 |
| The Godfather | 1027398 | 9.2 | 1972 | 1970 |
250 rows × 4 columns
imdb.groupby('Decade').count()
| Votes | Rating | Year | |
|---|---|---|---|
| Decade | |||
| 1920 | 4 | 4 | 4 |
| 1930 | 7 | 7 | 7 |
| 1940 | 14 | 14 | 14 |
| 1950 | 30 | 30 | 30 |
| 1960 | 22 | 22 | 22 |
| 1970 | 21 | 21 | 21 |
| 1980 | 31 | 31 | 31 |
| 1990 | 42 | 42 | 42 |
| 2000 | 50 | 50 | 50 |
| 2010 | 29 | 29 | 29 |
# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');
Let's try to do this two different ways.
%%pt
imdb[imdb.get('Decade') == 1990].sort_values('Rating', ascending=False).index[0]
Note: The command to extract the index of a DataFrame is .index - no parentheses! This is different than the way we extract columns, with .get(), because the index is not a column.
%%pt
imdb.reset_index().groupby('Decade').max()
.max()), aggregation is done to each column individually. max is not helpful here.We won't cover these problems in class, but they're here for you to practice with some harder examples. To access the solutions, you'll need to watch this solution walkthrough video (start at 10:00).
Before watching the video, make sure to try these problems on your own – they're great prep for homeworks, projects, and exams!
.sum() on a Boolean array/Series¶True elements. This is because Python treats True as 1 and False as 0.
Hint: Use .str.len() on the column or index that contains the names of the movies.
& if you require all to be true, or | if you require at least one to be true.df.groupby(column_name).agg_method() aggregates all rows with the same value for column_name into a single row in the resulting DataFrame, using agg_method() to combine values..count(), .sum(), .mean(), .median(), .max(), and .min().A picture is worth a 1000 words – it's time to visualize!