import pandas as pd
import numpy as np
import os
For illustration purposes, let's look at the temperatures and countries example DataFrames from the last lecture, with slight modifications.
temps = pd.DataFrame({
'City': ['San Diego', 'Toronto', 'Rome'],
'Temperature': [76, 28, 56],
'Humid': ['No', 'Yes', 'Yes']
})
other_temps = pd.DataFrame({
'City': ['Los Angeles', 'San Diego', 'Miami'],
'Temperature': [79, 76, 88],
'Humid': ['No', 'No', 'Yes']
})
countries = pd.DataFrame({
'City': ['Toronto', 'Shanghai', 'San Diego'],
'Country': ['Canada', 'China', 'USA']
})
temps
|  | City | Temperature | Humid |
---|---|---|---|
0 | San Diego | 76 | No |
1 | Toronto | 28 | Yes |
2 | Rome | 56 | Yes |
other_temps
|  | City | Temperature | Humid |
---|---|---|---|
0 | Los Angeles | 79 | No |
1 | San Diego | 76 | No |
2 | Miami | 88 | Yes |
countries
|  | City | Country |
---|---|---|
0 | Toronto | Canada |
1 | Shanghai | China |
2 | San Diego | USA |
Projection is used to project (keep) columns in a relation. Duplicate rows are dropped.
$$\Pi_{(\text{City, Humid})}(\text{temps})$$

temps[['City', 'Humid']].drop_duplicates()
|  | City | Humid |
---|---|---|
0 | San Diego | No |
1 | Toronto | Yes |
2 | Rome | Yes |
Selection is used to keep rows in a relation that satisfy certain conditions.
$$\sigma_{(\text{Temperature} > 50)}(\text{temps})$$

temps.loc[temps['Temperature'] > 50]
|  | City | Temperature | Humid |
---|---|---|---|
0 | San Diego | 76 | No |
2 | Rome | 56 | Yes |
Operators can be composed:
$$\Pi_{(\text{City, Humid})} \big(\sigma_{(\text{Temperature} > 50)}(\text{temps}) \big)$$

temps.loc[temps['Temperature'] > 50, ['City', 'Humid']].drop_duplicates()
|  | City | Humid |
---|---|---|
0 | San Diego | No |
2 | Rome | Yes |
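If you prefer method chaining, the same composition can be written with query; a minimal sketch, equivalent to the code above:

# Equivalent to the composed expression above: select rows, then project columns.
temps.query('Temperature > 50')[['City', 'Humid']].drop_duplicates()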
The cross product is used to create every possible combination of rows in the first relation with rows in the second relation.
$$\text{temps} \times \text{countries}$$

# Could also use temps.merge(countries, how='cross').
pd.merge(temps, countries, how='cross')
|  | City_x | Temperature | Humid | City_y | Country |
---|---|---|---|---|---|
0 | San Diego | 76 | No | Toronto | Canada |
1 | San Diego | 76 | No | Shanghai | China |
2 | San Diego | 76 | No | San Diego | USA |
3 | Toronto | 28 | Yes | Toronto | Canada |
4 | Toronto | 28 | Yes | Shanghai | China |
5 | Toronto | 28 | Yes | San Diego | USA |
6 | Rome | 56 | Yes | Toronto | Canada |
7 | Rome | 56 | Yes | Shanghai | China |
8 | Rome | 56 | Yes | San Diego | USA |
The cross product is not incredibly useful on its own, but it can be used with other operators to perform more meaningful operations.
What does the following compute?
$$\sigma_{\text{temps.City = countries.City}} \big( \text{temps} \times \text{countries} \big)$$

both = pd.merge(temps, countries, how='cross')
both[both['City_x'] == both['City_y']]
|  | City_x | Temperature | Humid | City_y | Country |
---|---|---|---|---|---|
2 | San Diego | 76 | No | San Diego | USA |
3 | Toronto | 28 | Yes | Toronto | Canada |
This is exactly the join of temps and countries on 'City'. In pandas, merge computes the join directly, without materializing the full cross product.

temps.merge(countries)
|  | City | Temperature | Humid | Country |
---|---|---|---|---|
0 | San Diego | 76 | No | USA |
1 | Toronto | 28 | Yes | Canada |
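Here, temps.merge(countries) relies on pandas' defaults: an inner join on all columns the two DataFrames share. Spelling the defaults out is equivalent and makes the join explicit; a minimal sketch:

# Same result as temps.merge(countries), with the key and join type written out.
temps.merge(countries, on='City', how='inner')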
The union is used to combine the rows of two relations. Duplicate rows are dropped. It only works if the two relations have the same attributes (column names).
pd.concat([temps, other_temps]).drop_duplicates()
|  | City | Temperature | Humid |
---|---|---|---|
0 | San Diego | 76 | No |
1 | Toronto | 28 | Yes |
2 | Rome | 56 | Yes |
0 | Los Angeles | 79 | No |
2 | Miami | 88 | Yes |
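Note that the result above keeps the index labels from the original DataFrames, so some labels appear twice. If a fresh index is wanted, one option (a sketch, not required for the union itself) is:

# Same union, but with a clean 0, 1, 2, ... index.
pd.concat([temps, other_temps]).drop_duplicates().reset_index(drop=True)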
The difference is used to find the rows that are in one relation but not the other. It only works if the two relations have the same attributes (column names).
$$\text{temps} - \text{other\_temps}$$

temps[~temps['City'].isin(other_temps['City'])]
|  | City | Temperature | Humid |
---|---|---|---|
1 | Toronto | 28 | Yes |
2 | Rome | 56 | Yes |
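The isin approach above compares rows by 'City' alone. If we wanted a difference that compares entire rows, one option (a sketch using merge's indicator flag) is:

# Keep rows of temps that do not appear, in full, in other_temps.
merged = temps.merge(other_temps, how='left', indicator=True)
merged[merged['_merge'] == 'left_only'].drop(columns='_merge')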
Suppose our goal is to determine the number of COVID cases in the US yesterday.
Why do you think so few cases were reported on Christmas Day – is it because COVID was less prevalent on Christmas Day as compared to the days before and after, or is it likely for some other reason? 🎅
The bigger-picture question we're asking here is: can we trust our data?
Data cleaning is the process of transforming data so that it best represents the underlying data generating process.
In practice, data cleaning is often detective work to understand data provenance.
Data cleaning often addresses:
Let's focus on the latter two.
Determine the kind of each of the following variables.
In the next cell, we'll load in an example dataset containing information about past DSC 80 students.

- 'Student ID' and 'Student Name': student ID (PID) and name.
- 'Month', 'Day', 'Year': date when the student was accepted to UCSD.
- '2021 tuition' and '2022 tuition': amount paid in tuition in 2021 and 2022, respectively.
- 'Percent Growth': growth between the two aforementioned columns.
- 'Paid': whether or not the student has paid tuition for this quarter yet.
- 'DSC 80 Final Grade': either 'Pass', 'Fail', or a number.

What needs to be changed in the DataFrame to extract meaningful insights?
students = pd.read_csv(os.path.join('data', 'students.csv'))
students
|  | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | $40000.00 | $50000.00 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | $9200.00 | $10120.00 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | $50000.00 | $62500.00 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | $7000.00 | $9800.00 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | $10000.00 | $5000.00 | -50.00% | Y | Pass |
total = students['2021 tuition'] + students['2022 tuition']
total
0 $40000.00$50000.00 1 $9200.00$10120.00 2 $50000.00$62500.00 3 $7000.00$9800.00 4 $10000.00$5000.00 dtype: object

Rather than adding the dollar amounts, + concatenated them, because both tuition columns are stored as strings.
What data type should each column have?

Use the dtypes attribute or the info method to peek at the data types.
students.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Student ID 5 non-null object 1 Student Name 5 non-null object 2 Month 5 non-null int64 3 Day 5 non-null int64 4 Year 5 non-null int64 5 2021 tuition 5 non-null object 6 2022 tuition 5 non-null object 7 Percent Growth 5 non-null object 8 Paid 5 non-null object 9 DSC 80 Final Grade 5 non-null object dtypes: int64(3), object(7) memory usage: 528.0+ bytes
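For comparison, the dtypes attribute returns just the data type of each column:

students.dtypes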
'2021 tuition' and '2022 tuition'

- '2021 tuition' and '2022 tuition' are stored as objects (strings), not numerical values.
- The '$' character causes the entries to be interpreted as strings.
- We can use str methods to strip the dollar sign.
- Methods accessed through .str will be applied to each element of the Series individually, rather than to the Series as a whole.

# This won't work. Why?
students['2021 tuition'].astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [18], in <cell line: 2>() 1 # This won't work. Why? ----> 2 students['2021 tuition'].astype(float) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/generic.py:5815, in NDFrame.astype(self, dtype, copy, errors) 5808 results = [ 5809 self.iloc[:, i].astype(dtype, copy=copy) 5810 for i in range(len(self.columns)) 5811 ] 5813 else: 5814 # else, only a single dtype is given -> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 5816 return self._constructor(new_data).__finalize__(self, method="astype") 5818 # GH 33113: handle empty frame or series File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/internals/managers.py:418, in BaseBlockManager.astype(self, dtype, copy, errors) 417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/internals/managers.py:327, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs) 325 applied = b.apply(f, **kwargs) 326 else: --> 327 applied = getattr(b, f)(**kwargs) 328 except (TypeError, NotImplementedError): 329 if not ignore_failures: File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/internals/blocks.py:592, in Block.astype(self, dtype, copy, errors) 574 """ 575 Coerce to the new dtype. 576 (...) 588 Block 589 """ 590 values = self.values --> 592 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 594 new_values = maybe_coerce_values(new_values) 595 newb = self.make_block(new_values) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1309, in astype_array_safe(values, dtype, copy, errors) 1306 dtype = pandas_dtype(dtype) 1308 try: -> 1309 new_values = astype_array(values, dtype, copy=copy) 1310 except (ValueError, TypeError): 1311 # e.g. astype_nansafe can fail on object-dtype of strings 1312 # trying to convert to float 1313 if errors == "ignore": File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1257, in astype_array(values, dtype, copy) 1254 values = values.astype(dtype, copy=copy) 1256 else: -> 1257 values = astype_nansafe(values, dtype, copy=copy) 1259 # in pandas we don't store numpy str dtypes, so convert to object 1260 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1201, in astype_nansafe(arr, dtype, copy, skipna) 1197 raise ValueError(msg) 1199 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): 1200 # Explicit copy, or required since NumPy can't view from / to object. -> 1201 return arr.astype(dtype, copy=True) 1203 return arr.astype(dtype, copy=copy) ValueError: could not convert string to float: '$40000.00'
# That's better!
students['2021 tuition'].str.strip('$').astype(float)
0 40000.0 1 9200.0 2 50000.0 3 7000.0 4 10000.0 Name: 2021 tuition, dtype: float64
We can loop through the columns of students to apply the above procedure. (Looping through columns is fine, just avoid looping through rows.)
students = pd.read_csv(os.path.join('data', 'students.csv'))
for col in students.columns:
if 'tuition' in col:
students[col] = students[col].str.strip('$').astype(float)
students
|  | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | Y | Pass |
Alternatively, we can do this without a loop by using str.contains to find only the columns that contain tuition information.
students = pd.read_csv(os.path.join('data', 'students.csv'))
cols = students.columns.str.contains('tuition')
students.loc[:, cols] = students.loc[:, cols].apply(lambda s: s.str.strip('$').astype(float), axis=0)
students
|  | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | Y | Pass |
'Paid'

- 'Paid' contains the strings 'Y' and 'N'.
- 'Y's and 'N's typically result from manual data entry.
- The 'Paid' column should contain Trues and Falses, or 1s and 0s.
- One fix is the replace method (sketched below); here we'll use a boolean comparison instead, which achieves the same thing.

students['Paid'].value_counts()
Y 3 N 2 Name: Paid, dtype: int64
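As mentioned above, the replace method is one way to do the conversion; a minimal sketch (the cell below uses a boolean comparison instead, which gives the same result):

# Map 'Y'/'N' to True/False without modifying the DataFrame.
students['Paid'].replace({'Y': True, 'N': False})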
students['Paid'] = students['Paid'] == 'Y'
students
|  | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | False | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | True | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | False | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | True | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | True | Pass |
'Month', 'Day', and 'Year'

- These three columns are currently stored using the int64 data type. This could be fine for certain purposes, but ideally they are stored as a single column (e.g. for sorting).
- Use pd.to_datetime to convert dates to datetime64 objects.

students.loc[:, 'Month': 'Year']
|  | Month | Day | Year |
---|---|---|---|
0 | 10 | 12 | 2020 |
1 | 4 | 15 | 2019 |
2 | 5 | 14 | 2021 |
3 | 7 | 10 | 2020 |
4 | 3 | 3 | 2021 |
students['Date'] = pd.to_datetime(students.loc[:, 'Month': 'Year'])
students = students.drop(columns=['Month', 'Day', 'Year'])
students
|  | Student ID | Student Name | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | Date |
---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 40000.0 | 50000.0 | 25.00% | False | 89 | 2020-10-12 |
1 | A20345992 | Mark White | 9200.0 | 10120.0 | 10.00% | True | 90 | 2019-04-15 |
2 | A21942188 | Amy Red | 50000.0 | 62500.0 | 25.00% | False | 97 | 2021-05-14 |
3 | A28049910 | Tom Green | 7000.0 | 9800.0 | 40.00% | True | 54 | 2020-07-10 |
4 | A27456704 | Rose Pink | 10000.0 | 5000.0 | -50.00% | True | Pass | 2021-03-03 |
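One benefit of a single datetime64 column is that chronological operations are now straightforward; a small sketch using sort_values and the .dt accessor:

# Sort students by acceptance date.
students.sort_values('Date')

# Extract the acceptance year from each date.
students['Date'].dt.year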
'DSC 80 Final Grade'

- 'DSC 80 Final Grade's are stored as objects (strings).
- Since the column contains the string 'Pass', we can't use astype to convert it.
- Instead, use pd.to_numeric(s, errors='coerce'), where s is a Series.
- errors='coerce' can cause uninformed destruction of data (see the check sketched below).

# Won't work!
students['DSC 80 Final Grade'].astype(int)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [26], in <cell line: 2>() 1 # Won't work! ----> 2 students['DSC 80 Final Grade'].astype(int) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/generic.py:5815, in NDFrame.astype(self, dtype, copy, errors) 5808 results = [ 5809 self.iloc[:, i].astype(dtype, copy=copy) 5810 for i in range(len(self.columns)) 5811 ] 5813 else: 5814 # else, only a single dtype is given -> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 5816 return self._constructor(new_data).__finalize__(self, method="astype") 5818 # GH 33113: handle empty frame or series File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/internals/managers.py:418, in BaseBlockManager.astype(self, dtype, copy, errors) 417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/internals/managers.py:327, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs) 325 applied = b.apply(f, **kwargs) 326 else: --> 327 applied = getattr(b, f)(**kwargs) 328 except (TypeError, NotImplementedError): 329 if not ignore_failures: File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/internals/blocks.py:592, in Block.astype(self, dtype, copy, errors) 574 """ 575 Coerce to the new dtype. 576 (...) 588 Block 589 """ 590 values = self.values --> 592 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 594 new_values = maybe_coerce_values(new_values) 595 newb = self.make_block(new_values) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1309, in astype_array_safe(values, dtype, copy, errors) 1306 dtype = pandas_dtype(dtype) 1308 try: -> 1309 new_values = astype_array(values, dtype, copy=copy) 1310 except (ValueError, TypeError): 1311 # e.g. astype_nansafe can fail on object-dtype of strings 1312 # trying to convert to float 1313 if errors == "ignore": File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1257, in astype_array(values, dtype, copy) 1254 values = values.astype(dtype, copy=copy) 1256 else: -> 1257 values = astype_nansafe(values, dtype, copy=copy) 1259 # in pandas we don't store numpy str dtypes, so convert to object 1260 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1174, in astype_nansafe(arr, dtype, copy, skipna) 1170 elif is_object_dtype(arr): 1171 1172 # work around NumPy brokenness, #1987 1173 if np.issubdtype(dtype.type, np.integer): -> 1174 return lib.astype_intsafe(arr, dtype) 1176 # if we have a datetime/timedelta array of objects 1177 # then coerce to a proper dtype and recall astype_nansafe 1179 elif is_datetime64_dtype(dtype): File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/pandas/_libs/lib.pyx:679, in pandas._libs.lib.astype_intsafe() ValueError: invalid literal for int() with base 10: 'Pass'
pd.to_numeric(students['DSC 80 Final Grade'], errors='coerce')
0 89.0 1 90.0 2 97.0 3 54.0 4 NaN Name: DSC 80 Final Grade, dtype: float64
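Because errors='coerce' silently turns anything non-numeric into NaN, it's a good idea to check what will be lost before overwriting the column; a minimal sketch:

# Values that exist in the original column but would become NaN after coercion.
grades = students['DSC 80 Final Grade']
grades[pd.to_numeric(grades, errors='coerce').isna() & grades.notna()]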
students['DSC 80 Final Grade'] = pd.to_numeric(students['DSC 80 Final Grade'], errors='coerce')
students
|  | Student ID | Student Name | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | Date |
---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 40000.0 | 50000.0 | 25.00% | False | 89.0 | 2020-10-12 |
1 | A20345992 | Mark White | 9200.0 | 10120.0 | 10.00% | True | 90.0 | 2019-04-15 |
2 | A21942188 | Amy Red | 50000.0 | 62500.0 | 25.00% | False | 97.0 | 2021-05-14 |
3 | A28049910 | Tom Green | 7000.0 | 9800.0 | 40.00% | True | 54.0 | 2020-07-10 |
4 | A27456704 | Rose Pink | 10000.0 | 5000.0 | -50.00% | True | NaN | 2021-03-03 |
pd.to_numeric?
'Student Name'

- We want names to be stored in the format 'Last Name, First Name', a common format.
- We can use str methods once again.

students['Student Name']
0 John Black 1 Mark White 2 Amy Red 3 Tom Green 4 Rose Pink Name: Student Name, dtype: object
parts = students['Student Name'].str.split()
parts
0 [John, Black] 1 [Mark, White] 2 [Amy, Red] 3 [Tom, Green] 4 [Rose, Pink] Name: Student Name, dtype: object
students['Student Name'] = parts.str[1] + ', ' + parts.str[0]
students
|  | Student ID | Student Name | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | Date |
---|---|---|---|---|---|---|---|---|
0 | A20104523 | Black, John | 40000.0 | 50000.0 | 25.00% | False | 89.0 | 2020-10-12 |
1 | A20345992 | White, Mark | 9200.0 | 10120.0 | 10.00% | True | 90.0 | 2019-04-15 |
2 | A21942188 | Red, Amy | 50000.0 | 62500.0 | 25.00% | False | 97.0 | 2021-05-14 |
3 | A28049910 | Green, Tom | 7000.0 | 9800.0 | 40.00% | True | 54.0 | 2020-07-10 |
4 | A27456704 | Pink, Rose | 10000.0 | 5000.0 | -50.00% | True | NaN | 2021-03-03 |
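One caveat: parts.str[1] assumes every name has exactly two tokens. If middle names were possible, splitting off only the final token would be safer; a hypothetical sketch (the names below are made up for illustration):

# Hypothetical names, used only to illustrate the multi-token case.
raw_names = pd.Series(['Mary Jane Watson', 'Rose Pink'])
parts = raw_names.str.rsplit(n=1)  # split on whitespace at most once, from the right
parts.str[1] + ', ' + parts.str[0]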
1649043031 looks like a number, but is probably a date (a Unix timestamp).
"USD 1,000,000" looks like a string, but is actually a number and a unit.
92093 looks like a number, but is really a ZIP code (a label, not the quantity 92,093).
Sometimes, False appears in a column of country codes. Why might this be? 🤔
import yaml
player = '''
name: Magnus Carlsen
age: 32
country: NO
'''
yaml.safe_load(player)
{'name': 'Magnus Carlsen', 'age': 32, 'country': False}
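This happens because PyYAML follows YAML 1.1, which treats unquoted values like yes, no, on, and off as booleans, so Norway's country code NO becomes False. Quoting the value keeps it a string; a small check using the yaml module imported above:

player_quoted = '''
name: Magnus Carlsen
age: 32
country: "NO"
'''
yaml.safe_load(player_quoted)
# {'name': 'Magnus Carlsen', 'age': 32, 'country': 'NO'}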
How well does the data represent reality?
Does the data contain unrealistic or "incorrect" values?