import pandas as pd
import numpy as np
import os
# Build the path with os.path.join so it works across operating systems.
elections_fp = os.path.join('data', 'elections.csv')
elections = pd.read_csv(elections_fp)
# Peek at the first 10 rows to sanity-check the load.
elections.head(10)
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
5 | Bush | Republican | 53.4 | 1988 | win |
6 | Dukakis | Democratic | 45.6 | 1988 | loss |
7 | Clinton | Democratic | 43.0 | 1992 | win |
8 | Bush | Republican | 37.4 | 1992 | loss |
9 | Perot | Independent | 18.9 | 1992 | loss |
assign
method.append
method.assign
and append
return a copy of the DataFrame, which is a great feature! As an aside, you should try your best to write chained pandas
code, as follows:
# Add a vote-share column (percentage as a fraction) and preview the result.
# .assign returns a modified copy; `elections` itself is untouched.
elections.assign(proportion_of_vote=elections['%'] / 100).head()
Candidate | Party | % | Year | Result | proportion_of_vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | loss | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | loss | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | win | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | loss | 0.376 |
You can chain together several steps at a time:
# Chain several steps at once. The second .assign uses a callable (lambda) so
# it operates on the *intermediate* DataFrame produced by the chain, not on
# the outer `elections` variable — with a plain Series the second step would
# silently misalign if an earlier step filtered or reordered rows.
(
    elections
    .assign(proportion_of_vote=elections['%'] / 100)
    .assign(Result=lambda df: df['Result'].str.upper())
    .head()
)
Candidate | Party | % | Year | Result | proportion_of_vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | LOSS | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | WIN | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | LOSS | 0.376 |
You can also use assign
when the desired column name has spaces, by using keyword arguments.
# assign's keywords must be valid identifiers, so a column name containing
# spaces has to be supplied by unpacking a dict with **.
spaced_col = {'Proportion of Vote': elections['%'] / 100}
elections.assign(**spaced_col).head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | loss | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | loss | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | win | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | loss | 0.376 |
loc
or []
.assign
, this modifies the underlying DataFrame rather than a copy of it.
# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
# Work on an independent copy so mutations below leave `elections` unmodified.
mod_elec = elections.copy()
mod_elec.head()
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
# Direct column assignment mutates mod_elec in place (unlike .assign, which
# returns a copy).
mod_elec['Proportion of Vote'] = mod_elec['%'].div(100)
mod_elec.head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | loss | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | loss | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | win | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | loss | 0.376 |
# Normalize the outcome labels to upper case, overwriting the column in place.
mod_elec['Result'] = mod_elec['Result'].str.upper()
mod_elec.head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | LOSS | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | WIN | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | LOSS | 0.376 |
# 🤔
# .loc with a label that doesn't yet exist *adds* a row with that label, so
# these create rows labeled -1 and -2 (labels, not positions!). Note in the
# output below that this enlargement upcasts 'Year' from int64 to float64
# (1980 -> 1980.0).
mod_elec.loc[-1, :] = ['Carter', 'Democratic', 50.1, 1976, 'WIN', 0.501]
mod_elec.loc[-2, :] = ['Ford', 'Republican', 48.0, 1976, 'LOSS', 0.48]
mod_elec
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980.0 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980.0 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980.0 | LOSS | 0.066 |
3 | Reagan | Republican | 58.8 | 1984.0 | WIN | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984.0 | LOSS | 0.376 |
5 | Bush | Republican | 53.4 | 1988.0 | WIN | 0.534 |
6 | Dukakis | Democratic | 45.6 | 1988.0 | LOSS | 0.456 |
7 | Clinton | Democratic | 43.0 | 1992.0 | WIN | 0.430 |
8 | Bush | Republican | 37.4 | 1992.0 | LOSS | 0.374 |
9 | Perot | Independent | 18.9 | 1992.0 | LOSS | 0.189 |
10 | Clinton | Democratic | 49.2 | 1996.0 | WIN | 0.492 |
11 | Dole | Republican | 40.7 | 1996.0 | LOSS | 0.407 |
12 | Perot | Independent | 8.4 | 1996.0 | LOSS | 0.084 |
13 | Gore | Democratic | 48.4 | 2000.0 | LOSS | 0.484 |
14 | Bush | Republican | 47.9 | 2000.0 | WIN | 0.479 |
15 | Kerry | Democratic | 48.3 | 2004.0 | LOSS | 0.483 |
16 | Bush | Republican | 50.7 | 2004.0 | WIN | 0.507 |
17 | Obama | Democratic | 52.9 | 2008.0 | WIN | 0.529 |
18 | McCain | Republican | 45.7 | 2008.0 | LOSS | 0.457 |
19 | Obama | Democratic | 51.1 | 2012.0 | WIN | 0.511 |
20 | Romney | Republican | 47.2 | 2012.0 | LOSS | 0.472 |
21 | Clinton | Democratic | 48.2 | 2016.0 | LOSS | 0.482 |
22 | Trump | Republican | 46.1 | 2016.0 | WIN | 0.461 |
23 | Biden | Democratic | 51.3 | 2020.0 | WIN | 0.513 |
24 | Trump | Republican | 46.9 | 2020.0 | LOSS | 0.469 |
-1 | Carter | Democratic | 50.1 | 1976.0 | WIN | 0.501 |
-2 | Ford | Republican | 48.0 | 1976.0 | LOSS | 0.480 |
# Sort rows by index label so the newly added -2 and -1 rows come first.
mod_elec = mod_elec.sort_index()
mod_elec.head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
-2 | Ford | Republican | 48.0 | 1976.0 | LOSS | 0.480 |
-1 | Carter | Democratic | 50.1 | 1976.0 | WIN | 0.501 |
0 | Reagan | Republican | 50.7 | 1980.0 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980.0 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980.0 | LOSS | 0.066 |
# df.reset_index(drop=True) drops the current index
# of the DataFrame and replaces it with an index of increasing integers.
# Note: this returns a new DataFrame; mod_elec itself is not modified here.
mod_elec.reset_index(drop=True)
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Ford | Republican | 48.0 | 1976.0 | LOSS | 0.480 |
1 | Carter | Democratic | 50.1 | 1976.0 | WIN | 0.501 |
2 | Reagan | Republican | 50.7 | 1980.0 | WIN | 0.507 |
3 | Carter | Democratic | 41.0 | 1980.0 | LOSS | 0.410 |
4 | Anderson | Independent | 6.6 | 1980.0 | LOSS | 0.066 |
5 | Reagan | Republican | 58.8 | 1984.0 | WIN | 0.588 |
6 | Mondale | Democratic | 37.6 | 1984.0 | LOSS | 0.376 |
7 | Bush | Republican | 53.4 | 1988.0 | WIN | 0.534 |
8 | Dukakis | Democratic | 45.6 | 1988.0 | LOSS | 0.456 |
9 | Clinton | Democratic | 43.0 | 1992.0 | WIN | 0.430 |
10 | Bush | Republican | 37.4 | 1992.0 | LOSS | 0.374 |
11 | Perot | Independent | 18.9 | 1992.0 | LOSS | 0.189 |
12 | Clinton | Democratic | 49.2 | 1996.0 | WIN | 0.492 |
13 | Dole | Republican | 40.7 | 1996.0 | LOSS | 0.407 |
14 | Perot | Independent | 8.4 | 1996.0 | LOSS | 0.084 |
15 | Gore | Democratic | 48.4 | 2000.0 | LOSS | 0.484 |
16 | Bush | Republican | 47.9 | 2000.0 | WIN | 0.479 |
17 | Kerry | Democratic | 48.3 | 2004.0 | LOSS | 0.483 |
18 | Bush | Republican | 50.7 | 2004.0 | WIN | 0.507 |
19 | Obama | Democratic | 52.9 | 2008.0 | WIN | 0.529 |
20 | McCain | Republican | 45.7 | 2008.0 | LOSS | 0.457 |
21 | Obama | Democratic | 51.1 | 2012.0 | WIN | 0.511 |
22 | Romney | Republican | 47.2 | 2012.0 | LOSS | 0.472 |
23 | Clinton | Democratic | 48.2 | 2016.0 | LOSS | 0.482 |
24 | Trump | Republican | 46.1 | 2016.0 | WIN | 0.461 |
25 | Biden | Democratic | 51.3 | 2020.0 | WIN | 0.513 |
26 | Trump | Republican | 46.9 | 2020.0 | LOSS | 0.469 |
Suppose our goal is to determine the number of COVID cases in the US yesterday.
Why do you think so few cases were reported on Christmas Day – is it because COVID was less prevalent on Christmas Day as compared to the days before and after, or is it likely for some other reason? 🎅
The bigger picture question we're asking here is, can we trust our data?
Data cleaning is the process of transforming data so that it best represents the underlying data generating process.
In practice, data cleaning is often detective work to understand data provenance.
Data cleaning often addresses:
Let's focus on the latter two.
Determine the kind of each of the following variables.
In the next cell, we'll load in an example dataset containing information about past DSC 80 students.
'PID'
and 'Student Name'
: student PID and name.'Month'
, 'Day'
, 'Year'
: date when the student was accepted to UCSD.'2021 tuition'
and '2022 tuition'
: amount paid in tuition in 2021 and 2022, respectively.'Percent Growth'
: growth between the two aforementioned columns.'Paid'
: whether or not the student has paid tuition for this quarter yet.'DSC 80 Final Grade'
: either 'Pass'
, 'Fail'
, or a number.What needs to be changed in the DataFrame to compute statistics?
# Load the example student dataset (path built portably, like elections above).
students_fp = os.path.join('data', 'students.csv')
students = pd.read_csv(students_fp)
students
Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | $40000.00 | $50000.00 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | $9200.00 | $10120.00 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | $50000.00 | $62500.00 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | $7000.00 | $9800.00 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | $10000.00 | $5000.00 | -50.00% | Y | Pass |
students
Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | $40000.00 | $50000.00 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | $9200.00 | $10120.00 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | $50000.00 | $62500.00 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | $7000.00 | $9800.00 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | $10000.00 | $5000.00 | -50.00% | Y | Pass |
# Both tuition columns are stored as strings, so + concatenates them instead
# of adding numbers — a clear sign the dtypes need cleaning.
total = students['2021 tuition'] + students['2022 tuition']
total
0 $40000.00$50000.00 1 $9200.00$10120.00 2 $50000.00$62500.00 3 $7000.00$9800.00 4 $10000.00$5000.00 dtype: object
students
What data type should each column have?
Use the dtypes
attribute (or the info
method) to peek at the data types.
# Every column except Month/Day/Year is stored as `object` (strings).
students.dtypes
Student ID object Student Name object Month int64 Day int64 Year int64 2021 tuition object 2022 tuition object Percent Growth object Paid object DSC 80 Final Grade object dtype: object
'2021 tuition'
and '2022 tuition'
¶'2021 tuition'
and '2022 tuition'
are stored as object
s (strings), not numerical values.'$'
character causes the entries to be interpreted as strings.str
methods to strip the dollar sign.# This won't work. Why?
# astype(float) fails because entries like '$40000.00' contain a '$'.
students['2021 tuition'].astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_68306/2133079742.py in <module> 1 # This won't work. Why? ----> 2 students['2021 tuition'].astype(float) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 5918 else: 5919 # else, only a single dtype is given -> 5920 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 5921 return self._constructor(new_data).__finalize__(self, method="astype") 5922 ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 417 418 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 419 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 420 421 def convert( ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) 302 applied = b.apply(f, **kwargs) 303 else: --> 304 applied = getattr(b, f)(**kwargs) 305 except (TypeError, NotImplementedError): 306 if not ignore_failures: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 578 values = self.values 579 --> 580 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 581 582 new_values = maybe_coerce_values(new_values) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors) 1290 1291 try: -> 1292 new_values = astype_array(values, dtype, copy=copy) 1293 except (ValueError, TypeError): 1294 # e.g. 
astype_nansafe can fail on object-dtype of strings ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy) 1235 1236 else: -> 1237 values = astype_nansafe(values, dtype, copy=copy) 1238 1239 # in pandas we don't store numpy str dtypes, so convert to object ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna) 1179 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): 1180 # Explicit copy, or required since NumPy can't view from / to object. -> 1181 return arr.astype(dtype, copy=True) 1182 1183 return arr.astype(dtype, copy=copy) ValueError: could not convert string to float: '$40000.00'
# That's better!
# Remove the '$' with the .str accessor, then convert to float.
students['2021 tuition'].str.strip('$').astype(float)
0 40000.0 1 9200.0 2 50000.0 3 7000.0 4 10000.0 Name: 2021 tuition, dtype: float64
We can loop through the columns of students
to apply the above procedure. (Looping through columns is fine, just avoid looping through rows.)
# Convert every tuition column: drop the '$' and parse the value as a float.
# (Looping over columns is fine; it's row loops that should be avoided.)
tuition_cols = [c for c in students.columns if 'tuition' in c]
for c in tuition_cols:
    students[c] = students[c].str.strip('$').astype(float)
students
Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | Y | Pass |
Alternatively, we can do this without a loop by using str.contains
to find only the columns that contain tuition information.
# NOTE(review): this only works here because the previous cell already
# stripped the '$' and converted these columns to floats — astype(float)
# alone cannot parse strings like '$40000.00'. Run standalone, this cell
# would also need the .str.strip('$') step.
cols = students.columns.str.contains('tuition')
students.loc[:, cols] = students.loc[:, cols].astype(float)
students
Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | Y | Pass |
'Paid'
¶'Paid'
contains the strings 'Y'
and 'N'
.'Y'
s and 'N'
s typically result from manual data entry.'Paid'
column should contain True
s and False
s, or 1
s and 0
s.replace
Series method.students['Paid'].replace({'Y': True, 'N': False})
0 False 1 True 2 False 3 True 4 True Name: Paid, dtype: bool
students['Paid'].value_counts()
Y 3 N 2 Name: Paid, dtype: int64
# Overwrite 'Paid' with booleans: True exactly where the entry was 'Y'.
students['Paid'] = students['Paid'].eq('Y')
students
Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | False | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | True | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | False | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | True | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | True | Pass |
'Month'
, 'Day'
, and 'Year'
¶int64
data type. This could be fine for certain purposes, but ideally they are stored as a single column (e.g. for sorting).'YYYY-MM-DD'
.datetime64
objects (later).(
students['Year'].astype(str) + '-' +
students['Month'].astype(str).str.zfill(2) + '-' +
students['Day'].astype(str).str.zfill(2)
)
0 2020-10-12 1 2019-04-15 2 2021-05-14 3 2020-07-10 4 2021-03-03 dtype: object
Note:
zfill
string method adds zeroes to the start of a string until it reaches the specified length.'DSC 80 Final Grade'
¶'DSC 80 Final Grade'
s are stored as object
s (strings).'Pass'
, we can't use astype
to convert it.pd.to_numeric(s, errors='coerce')
, where s
is a Series.errors='coerce'
can cause uninformed destruction of data.# Won't work!
students['DSC 80 Final Grade'].astype(int)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_68306/1326609327.py in <module> 1 # Won't work! ----> 2 students['DSC 80 Final Grade'].astype(int) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 5918 else: 5919 # else, only a single dtype is given -> 5920 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 5921 return self._constructor(new_data).__finalize__(self, method="astype") 5922 ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 417 418 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 419 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 420 421 def convert( ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) 302 applied = b.apply(f, **kwargs) 303 else: --> 304 applied = getattr(b, f)(**kwargs) 305 except (TypeError, NotImplementedError): 306 if not ignore_failures: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 578 values = self.values 579 --> 580 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 581 582 new_values = maybe_coerce_values(new_values) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors) 1290 1291 try: -> 1292 new_values = astype_array(values, dtype, copy=copy) 1293 except (ValueError, TypeError): 1294 # e.g. 
astype_nansafe can fail on object-dtype of strings ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy) 1235 1236 else: -> 1237 values = astype_nansafe(values, dtype, copy=copy) 1238 1239 # in pandas we don't store numpy str dtypes, so convert to object ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna) 1152 # work around NumPy brokenness, #1987 1153 if np.issubdtype(dtype.type, np.integer): -> 1154 return lib.astype_intsafe(arr, dtype) 1155 1156 # if we have a datetime/timedelta array of objects ~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe() ValueError: invalid literal for int() with base 10: 'Pass'
pd.to_numeric(students['DSC 80 Final Grade'], errors='coerce')
0 89.0 1 90.0 2 97.0 3 54.0 4 NaN Name: DSC 80 Final Grade, dtype: float64
# Coerce grades to numbers; non-numeric entries like 'Pass' become NaN.
# (Beware: errors='coerce' silently destroys any non-numeric data.)
students['DSC 80 Final Grade'] = pd.to_numeric(students['DSC 80 Final Grade'], errors='coerce')
students
Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | False | 89.0 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | True | 90.0 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | False | 97.0 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | True | 54.0 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | True | NaN |
pd.to_numeric?
'Student Name'
¶'Last Name, First Name'
, a common format.apply
method.s
is a Series, s.apply(func)
applies the function func
to each entry of s
.students['Student Name']
0 John Black 1 Mark White 2 Amy Red 3 Tom Green 4 Rose Pink Name: Student Name, dtype: object
def transpose_name(name):
    """Convert a name from 'First Last' format to 'Last, First'.

    Generalized to names with more than two parts: everything before the
    final word is treated as the given name(s), so
    'Mary Ann Smith' -> 'Smith, Mary Ann'. (The original two-word behavior
    is unchanged; previously a middle name raised a ValueError.)
    """
    *given, last = name.split()
    return last + ', ' + ' '.join(given)
transpose_name('King Triton')
'Triton, King'
students['Student Name'].apply(transpose_name)
0 Black, John 1 White, Mark 2 Red, Amy 3 Green, Tom 4 Pink, Rose Name: Student Name, dtype: object
str
methods are useful – use them!
str
attribute of Series.parts = students['Student Name'].str.split()
parts
0 [John, Black] 1 [Mark, White] 2 [Amy, Red] 3 [Tom, Green] 4 [Rose, Pink] Name: Student Name, dtype: object
parts.str[1] + ', ' + parts.str[0]
0 Black, John 1 White, Mark 2 Red, Amy 3 Green, Tom 4 Pink, Rose Name: Student Name, dtype: object
1649043031 looks like a number, but is probably a date.
"USD 1,000,000" looks like a string, but is actually a number and a unit.
92093 looks like a number, but is really a zip code (and isn't equal to 92,093).
Sometimes, False
appears in a column of country codes. Why might this be?
🤔
import yaml
# YAML 1.1 resolves unquoted NO/no/off (and yes/on/etc.) as booleans — the
# infamous "Norway problem". That's why 'country' below loads as False
# instead of the country code 'NO'.
player = '''
name: Magnus Carlsen
age: 31
country: NO
'''
yaml.safe_load(player)
{'name': 'Magnus Carlsen', 'age': 31, 'country': False}
In other words, how well does the data represent reality?
Does the data contain unrealistic or "incorrect" values?
The dataset we're working with contains all of the vehicle stops that the San Diego Police Department made in 2016.
# Load all SDPD vehicle stops made in 2016. Build the path with
# os.path.join, consistent with how the other datasets in this file are read.
stops = pd.read_csv(os.path.join('data', 'vehicle_stops_2016_datasd.csv'))
stops.head()
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
1 | 1308172 | Moving Violation | 520 | B | M | 25 | 2016-01-01 00:10:00 | 2016-01-01 | 0:10 | N | N | N | NaN | NaN | NaN |
2 | 1308171 | Moving Violation | 110 | H | F | 31 | 2016-01-01 00:14:00 | 2016-01-01 | 0:14 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1308170 | Moving Violation | Unknown | W | F | 29 | 2016-01-01 00:16:00 | 2016-01-01 | 0:16 | N | N | N | NaN | NaN | NaN |
4 | 1308197 | Moving Violation | 230 | W | M | 52 | 2016-01-01 00:30:00 | 2016-01-01 | 0:30 | N | N | N | NaN | NaN | NaN |
stops.head(1)
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
stops.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 103051 entries, 0 to 103050 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 stop_id 103051 non-null int64 1 stop_cause 103044 non-null object 2 service_area 103051 non-null object 3 subject_race 102920 non-null object 4 subject_sex 102865 non-null object 5 subject_age 100284 non-null object 6 timestamp 102879 non-null object 7 stop_date 103051 non-null object 8 stop_time 103051 non-null object 9 sd_resident 83689 non-null object 10 arrested 84400 non-null object 11 searched 83330 non-null object 12 obtained_consent 4791 non-null object 13 contraband_found 4969 non-null object 14 property_seized 4924 non-null object dtypes: int64(1), object(14) memory usage: 11.8+ MB
'subject_age'
– some are too old to be true, some are too low to be true.stops['subject_age'].unique()
array(['28', '25', '31', '29', '52', '24', '20', '50', '23', '54', '53', '35', '38', '18', '48', '68', '45', '63', '49', '42', '27', '19', '55', '32', '47', '33', '41', '59', '60', '58', '26', '36', '40', '39', '21', '64', '30', '43', '17', '51', '34', '56', '44', '22', '69', '46', '16', '57', '37', '65', '72', '67', '66', '70', '62', '73', '74', '0', '77', nan, '89', '79', '61', '78', '99', '75', '85', '82', '71', '15', '80', '81', '93', '84', '76', '2', '4', '86', '91', '83', '88', '98', '87', 'No Age', '9', '100', '14', '95', '96', '92', '119', '1', '90', '163', '5', '114', '94', '10', '212', '220', '6', '145', '97', '120'], dtype=object)
# Coerce ages to numbers ('No Age' and similar junk become NaN), then
# summarize to spot unrealistic values.
ages = pd.to_numeric(stops['subject_age'], errors='coerce')
ages.describe()
count 99648.000000 mean 37.277697 std 14.456934 min 0.000000 25% 25.000000 50% 34.000000 75% 47.000000 max 220.000000 Name: subject_age, dtype: float64
Ages range all over the place, from 0 to 220. Was a 220 year old really pulled over?
stops.loc[ages > 100]
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25029 | 1333254 | Equipment Violation | 820 | W | F | 119 | 2016-03-19 22:20:00 | 2016-03-19 | 22:20 | N | N | N | NaN | NaN | NaN |
34179 | 1375483 | Moving Violation | 510 | W | M | 163 | 2016-04-18 16:35:00 | 2016-04-18 | 16:35 | N | N | N | NaN | NaN | NaN |
36968 | 1378369 | Moving Violation | 240 | A | F | 114 | 2016-04-28 11:44:00 | 2016-04-28 | 11:44 | NaN | NaN | NaN | NaN | NaN | NaN |
63570 | 1405478 | Moving Violation | 110 | V | M | 212 | 2016-08-04 18:10:00 | 2016-08-04 | 18:10 | Y | N | N | NaN | NaN | NaN |
70267 | 1411694 | Moving Violation | 310 | H | F | 220 | 2016-08-30 18:28:00 | 2016-08-30 | 18:28 | Y | N | N | NaN | NaN | NaN |
77038 | 1418885 | Moving Violation | 830 | B | M | 145 | 2016-09-22 20:35:00 | 2016-09-22 | 20:35 | Y | N | Y | N | N | NaN |
99449 | 1440889 | Equipment Violation | 120 | W | F | 120 | 2016-12-15 19:28:00 | 2016-12-15 | 19:28 | Y | N | N | NaN | NaN | NaN |
ages.loc[(ages >= 0) & (ages < 16)].value_counts()
0.0 218 15.0 27 2.0 6 14.0 5 4.0 4 1.0 2 10.0 2 9.0 1 5.0 1 6.0 1 Name: subject_age, dtype: int64
stops.loc[(ages >= 0) & (ages < 16)]
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
547 | 1308628 | Moving Violation | 120 | W | M | 0 | 2016-01-03 16:20:00 | 2016-01-03 | 16:20 | NaN | NaN | NaN | NaN | NaN | NaN |
747 | 1308820 | Moving Violation | Unknown | H | F | 0 | 2016-01-04 22:13:00 | 2016-01-04 | 22:13 | N | N | N | NaN | NaN | NaN |
1686 | 1309896 | Moving Violation | 120 | W | M | 15 | 2016-01-08 19:30:00 | 2016-01-08 | 19:30 | N | N | N | NaN | NaN | NaN |
1752 | 1309975 | Moving Violation | Unknown | O | M | 0 | 2016-01-08 23:20:00 | 2016-01-08 | 23:20 | N | N | N | NaN | NaN | NaN |
2054 | 1310377 | Equipment Violation | 430 | H | F | 0 | 2016-01-10 16:42:00 | 2016-01-10 | 16:42 | Y | N | N | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100465 | 1442806 | Moving Violation | 810 | O | M | 15 | 2016-12-20 10:45:00 | 2016-12-20 | 10:45 | Y | N | N | NaN | NaN | NaN |
101495 | 1443500 | Moving Violation | 320 | W | M | 0 | 2016-12-23 22:30:00 | 2016-12-23 | 22:30 | NaN | NaN | NaN | NaN | NaN | NaN |
101784 | 1443745 | Moving Violation | Unknown | O | M | 0 | 2016-12-26 19:30:00 | 2016-12-26 | 19:30 | NaN | NaN | NaN | NaN | NaN | NaN |
101790 | 1443747 | Moving Violation | 320 | W | F | 0 | 2016-12-26 20:00:00 | 2016-12-26 | 20:00 | NaN | NaN | NaN | NaN | NaN | NaN |
102848 | 1444647 | Moving Violation | 320 | W | M | 0 | 2016-12-31 17:45:00 | 2016-12-31 | 17:45 | NaN | NaN | NaN | NaN | NaN | NaN |
267 rows × 15 columns
'subject_age'
¶'No Age'
and 0
are likely explicit null values.In the coming weeks, we'll cover more solutions to these problems.