import pandas as pd
import numpy as np
import os
For illustration purposes, let's look at the temperatures and countries example DataFrames from the last lecture, with slight modifications.
temps = pd.DataFrame({
'City': ['San Diego', 'Toronto', 'Rome'],
'Temperature': [76, 28, 56],
'Humid': ['No', 'Yes', 'Yes']
})
other_temps = pd.DataFrame({
'City': ['Los Angeles', 'San Diego', 'Miami'],
'Temperature': [79, 76, 88],
'Humid': ['No', 'No', 'Yes']
})
countries = pd.DataFrame({
'City': ['Toronto', 'Shanghai', 'San Diego'],
'Country': ['Canada', 'China', 'USA']
})
temps
| | City | Temperature | Humid |
---|---|---|---|
0 | San Diego | 76 | No |
1 | Toronto | 28 | Yes |
2 | Rome | 56 | Yes |
other_temps
| | City | Temperature | Humid |
---|---|---|---|
0 | Los Angeles | 79 | No |
1 | San Diego | 76 | No |
2 | Miami | 88 | Yes |
countries
| | City | Country |
---|---|---|
0 | Toronto | Canada |
1 | Shanghai | China |
2 | San Diego | USA |
Used to project (keep) columns in a relation. Duplicate rows are dropped.
$$\Pi_{(\text{City, Humid})}(\text{temps})$$
temps[['City', 'Humid']].drop_duplicates()
| | City | Humid |
---|---|---|
0 | San Diego | No |
1 | Toronto | Yes |
2 | Rome | Yes |
Used to keep rows in a relation that satisfy certain conditions.
$$\sigma_{(\text{Temperature} > 50)}(\text{temps})$$
temps[temps['Temperature'] > 50]
| | City | Temperature | Humid |
---|---|---|---|
0 | San Diego | 76 | No |
2 | Rome | 56 | Yes |
Operators can be composed:
$$\Pi_{(\text{City, Humid})} \big(\sigma_{(\text{Temperature} > 50)}(\text{temps}) \big)$$
temps.loc[temps['Temperature'] > 50, ['City', 'Humid']].drop_duplicates()
| | City | Humid |
---|---|---|
0 | San Diego | No |
2 | Rome | Yes |
Used to create every possible combination of rows in the first relation with rows in the second relation.
$$\text{temps} \times \text{countries}$$
# Could also use temps.merge(countries, how='cross').
pd.merge(temps, countries, how='cross')
| | City_x | Temperature | Humid | City_y | Country |
---|---|---|---|---|---|
0 | San Diego | 76 | No | Toronto | Canada |
1 | San Diego | 76 | No | Shanghai | China |
2 | San Diego | 76 | No | San Diego | USA |
3 | Toronto | 28 | Yes | Toronto | Canada |
4 | Toronto | 28 | Yes | Shanghai | China |
5 | Toronto | 28 | Yes | San Diego | USA |
6 | Rome | 56 | Yes | Toronto | Canada |
7 | Rome | 56 | Yes | Shanghai | China |
8 | Rome | 56 | Yes | San Diego | USA |
The cross product is not incredibly useful on its own, but it can be used with other operators to perform more meaningful operations.
What does the following compute?
$$\sigma_{\text{temps.City = countries.City}} \big( \text{temps} \times \text{countries} \big)$$
both = pd.merge(temps, countries, how='cross')
both[both['City_x'] == both['City_y']]
| | City_x | Temperature | Humid | City_y | Country |
---|---|---|---|---|---|
2 | San Diego | 76 | No | San Diego | USA |
3 | Toronto | 28 | Yes | Toronto | Canada |
It keeps only the rows where both relations have the same 'City' – that is, a join. In pandas, merge performs this directly:
temps.merge(countries)
| | City | Temperature | Humid | Country |
---|---|---|---|---|
0 | San Diego | 76 | No | USA |
1 | Toronto | 28 | Yes | Canada |
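For reference, merge joined on the shared 'City' column with an inner join by default; spelling those defaults out explicitly gives the same result (a sketch, not from the original lecture):
# Same join, with the join key and join type made explicit.
temps.merge(countries, on='City', how='inner')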
Used to combine the rows of two relations. Duplicate rows are dropped. Only works if the two relations have the same attributes (column names).
pd.concat([temps, other_temps]).drop_duplicates()
| | City | Temperature | Humid |
---|---|---|---|
0 | San Diego | 76 | No |
1 | Toronto | 28 | Yes |
2 | Rome | 56 | Yes |
0 | Los Angeles | 79 | No |
2 | Miami | 88 | Yes |
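Note that concat keeps each DataFrame's original index, which is why 0 and 2 appear twice above. A common follow-up (a sketch, not required) is to reset it:
# Same union, but with a fresh 0, 1, 2, ... index instead of the originals.
pd.concat([temps, other_temps]).drop_duplicates().reset_index(drop=True)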
Used to find the rows that are in one relation but not the other. Only works if the two relations have the same attributes (column names).
$$\text{temps} - \text{other\_temps}$$
temps[~temps['City'].isin(other_temps['City'])]
| | City | Temperature | Humid |
---|---|---|---|
1 | Toronto | 28 | Yes |
2 | Rome | 56 | Yes |
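One caveat: the expression above compares only 'City'. A hedged sketch of a full-row difference, using an indicator merge (not part of the original lecture):
# Full-row difference: keep rows of temps that do not appear in other_temps,
# comparing every shared column rather than just 'City'.
merged = temps.merge(other_temps, how='left', indicator=True)
merged[merged['_merge'] == 'left_only'].drop(columns='_merge')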
Suppose our goal is to determine the number of COVID cases in the US yesterday.
Why do you think so few cases were reported on Christmas Day – is it because COVID was less prevalent on Christmas Day as compared to the days before and after, or is it likely for some other reason? 🎅
The bigger picture question we're asking here is, can we trust our data?
Data cleaning is the process of transforming data so that it best represents the underlying data generating process.
In practice, data cleaning is often detective work to understand data provenance.
Data cleaning often addresses:
Let's focus on the latter two.
Determine the kind of each of the following variables.
In the next cell, we'll load in an example dataset containing information about past DSC 80 students.
- 'PID' and 'Student Name': student PID and name.
- 'Month', 'Day', 'Year': date when the student was accepted to UCSD.
- '2021 tuition' and '2022 tuition': amount paid in tuition in 2021 and 2022, respectively.
- 'Percent Growth': growth between the two aforementioned columns.
- 'Paid': whether or not the student has paid tuition for this quarter yet.
- 'DSC 80 Final Grade': either 'Pass', 'Fail', or a number.
What needs to be changed in the DataFrame to extract meaningful insights?
students = pd.read_csv(os.path.join('data', 'students.csv'))
students
| | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | $40000.00 | $50000.00 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | $9200.00 | $10120.00 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | $50000.00 | $62500.00 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | $7000.00 | $9800.00 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | $10000.00 | $5000.00 | -50.00% | Y | Pass |
students
| | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | $40000.00 | $50000.00 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | $9200.00 | $10120.00 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | $50000.00 | $62500.00 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | $7000.00 | $9800.00 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | $10000.00 | $5000.00 | -50.00% | Y | Pass |
total = students['2021 tuition'] + students['2022 tuition']
total
0 $40000.00$50000.00 1 $9200.00$10120.00 2 $50000.00$62500.00 3 $7000.00$9800.00 4 $10000.00$5000.00 dtype: object
What data type should each column of students have? Use the dtypes attribute or the info method to peek at the data types.
students.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Student ID 5 non-null object 1 Student Name 5 non-null object 2 Month 5 non-null int64 3 Day 5 non-null int64 4 Year 5 non-null int64 5 2021 tuition 5 non-null object 6 2022 tuition 5 non-null object 7 Percent Growth 5 non-null object 8 Paid 5 non-null object 9 DSC 80 Final Grade 5 non-null object dtypes: int64(3), object(7) memory usage: 528.0+ bytes
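The dtypes attribute gives the same information more compactly (a quick sketch; output omitted):
# A Series mapping each column name to its dtype.
students.dtypes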
'2021 tuition' and '2022 tuition'
- '2021 tuition' and '2022 tuition' are stored as objects (strings), not numerical values.
- The '$' character causes the entries to be interpreted as strings.
- We can use the str methods to strip the dollar sign; .str applies a method to each element of the Series individually, rather than to the Series as a whole.
# This won't work. Why?
students['2021 tuition'].astype(float)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_68707/2133079742.py in <module> 1 # This won't work. Why? ----> 2 students['2021 tuition'].astype(float) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 6238 else: 6239 # else, only a single dtype is given -> 6240 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 6241 return self._constructor(new_data).__finalize__(self, method="astype") 6242 ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 443 444 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 445 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 446 447 def convert( ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) 345 applied = b.apply(f, **kwargs) 346 else: --> 347 applied = getattr(b, f)(**kwargs) 348 except (TypeError, NotImplementedError): 349 if not ignore_failures: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 524 values = self.values 525 --> 526 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 527 528 new_values = maybe_coerce_values(new_values) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/astype.py in astype_array_safe(values, dtype, copy, errors) 297 298 try: --> 299 new_values = astype_array(values, dtype, copy=copy) 300 except (ValueError, TypeError): 301 # e.g. astype_nansafe can fail on object-dtype of strings ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/astype.py in astype_array(values, dtype, copy) 228 229 else: --> 230 values = astype_nansafe(values, dtype, copy=copy) 231 232 # in pandas we don't store numpy str dtypes, so convert to object ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/astype.py in astype_nansafe(arr, dtype, copy, skipna) 168 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): 169 # Explicit copy, or required since NumPy can't view from / to object. --> 170 return arr.astype(dtype, copy=True) 171 172 return arr.astype(dtype, copy=copy) ValueError: could not convert string to float: '$40000.00'
# That's better!
students['2021 tuition'].str.strip('$').astype(float)
0 40000.0 1 9200.0 2 50000.0 3 7000.0 4 10000.0 Name: 2021 tuition, dtype: float64
We can loop through the columns of students to apply the above procedure. (Looping through columns is fine, just avoid looping through rows.)
for col in students.columns:
if 'tuition' in col:
students[col] = students[col].str.strip('$').astype(float)
students
| | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | Y | Pass |
Alternatively, we can do this without a loop by using str.contains to find only the columns that contain tuition information.
# Note: the '$' signs were already stripped by the loop above, so only the cast is needed here.
cols = students.columns.str.contains('tuition')
students.loc[:, cols] = students.loc[:, cols].astype(float)
students
| | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | N | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | Y | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | N | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | Y | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | Y | Pass |
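As a hedged aside (not part of the original walkthrough): on a freshly loaded copy of the data, str.contains can be combined with apply to strip and cast every tuition column at once, with no Python loop:
# Sketch: re-read the raw file, then run the strip-and-cast procedure on each tuition column.
raw = pd.read_csv(os.path.join('data', 'students.csv'))
tuition_cols = raw.columns.str.contains('tuition')
raw.loc[:, tuition_cols] = raw.loc[:, tuition_cols].apply(lambda s: s.str.strip('$').astype(float))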
'Paid'
- 'Paid' contains the strings 'Y' and 'N'.
- 'Y's and 'N's typically result from manual data entry.
- The 'Paid' column should contain Trues and Falses, or 1s and 0s.
- We can use the replace method.
students['Paid'].value_counts()
Y 3 N 2 Name: Paid, dtype: int64
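As the bullets above note, replace is one option; a minimal sketch of that alternative (the next cell uses a boolean comparison instead):
# Map 'Y'/'N' to booleans with replace; equivalent to the comparison used below.
students['Paid'].replace({'Y': True, 'N': False})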
students['Paid'] = students['Paid'] == 'Y'
students
| | Student ID | Student Name | Month | Day | Year | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade |
---|---|---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 10 | 12 | 2020 | 40000.0 | 50000.0 | 25.00% | False | 89 |
1 | A20345992 | Mark White | 4 | 15 | 2019 | 9200.0 | 10120.0 | 10.00% | True | 90 |
2 | A21942188 | Amy Red | 5 | 14 | 2021 | 50000.0 | 62500.0 | 25.00% | False | 97 |
3 | A28049910 | Tom Green | 7 | 10 | 2020 | 7000.0 | 9800.0 | 40.00% | True | 54 |
4 | A27456704 | Rose Pink | 3 | 3 | 2021 | 10000.0 | 5000.0 | -50.00% | True | Pass |
'Month', 'Day', and 'Year'
- These columns are stored separately, each with the int64 data type. This could be fine for certain purposes, but ideally they are stored as a single column (e.g. for sorting).
- We can use pd.to_datetime to convert dates to datetime64 objects.
students.loc[:, 'Month': 'Year']
| | Month | Day | Year |
---|---|---|---|
0 | 10 | 12 | 2020 |
1 | 4 | 15 | 2019 |
2 | 5 | 14 | 2021 |
3 | 7 | 10 | 2020 |
4 | 3 | 3 | 2021 |
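As a hedged alternative, pd.to_datetime can also parse assembled date strings; a sketch using the same three columns before they are dropped:
# Build strings like '2020-10-12' from the three columns and let pandas parse them.
date_strings = (students['Year'].astype(str) + '-'
                + students['Month'].astype(str) + '-'
                + students['Day'].astype(str))
pd.to_datetime(date_strings)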
students['Date'] = pd.to_datetime(students.loc[:, 'Month': 'Year'])
students = students.drop(columns=['Month', 'Day', 'Year'])
students
| | Student ID | Student Name | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | Date |
---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 40000.0 | 50000.0 | 25.00% | False | 89 | 2020-10-12 |
1 | A20345992 | Mark White | 9200.0 | 10120.0 | 10.00% | True | 90 | 2019-04-15 |
2 | A21942188 | Amy Red | 50000.0 | 62500.0 | 25.00% | False | 97 | 2021-05-14 |
3 | A28049910 | Tom Green | 7000.0 | 9800.0 | 40.00% | True | 54 | 2020-07-10 |
4 | A27456704 | Rose Pink | 10000.0 | 5000.0 | -50.00% | True | Pass | 2021-03-03 |
'DSC 80 Final Grade'
- 'DSC 80 Final Grade's are stored as objects (strings).
- Since one of the entries is 'Pass', we can't use astype to convert it.
- One solution: use pd.to_numeric(s, errors='coerce'), where s is a Series.
- Be careful: errors='coerce' can cause uninformed destruction of data.
# Won't work!
students['DSC 80 Final Grade'].astype(int)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_68707/1326609327.py in <module> 1 # Won't work! ----> 2 students['DSC 80 Final Grade'].astype(int) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors) 6238 else: 6239 # else, only a single dtype is given -> 6240 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 6241 return self._constructor(new_data).__finalize__(self, method="astype") 6242 ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors) 443 444 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: --> 445 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 446 447 def convert( ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) 345 applied = b.apply(f, **kwargs) 346 else: --> 347 applied = getattr(b, f)(**kwargs) 348 except (TypeError, NotImplementedError): 349 if not ignore_failures: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors) 524 values = self.values 525 --> 526 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 527 528 new_values = maybe_coerce_values(new_values) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/astype.py in astype_array_safe(values, dtype, copy, errors) 297 298 try: --> 299 new_values = astype_array(values, dtype, copy=copy) 300 except (ValueError, TypeError): 301 # e.g. astype_nansafe can fail on object-dtype of strings ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/astype.py in astype_array(values, dtype, copy) 228 229 else: --> 230 values = astype_nansafe(values, dtype, copy=copy) 231 232 # in pandas we don't store numpy str dtypes, so convert to object ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/dtypes/astype.py in astype_nansafe(arr, dtype, copy, skipna) 168 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): 169 # Explicit copy, or required since NumPy can't view from / to object. --> 170 return arr.astype(dtype, copy=True) 171 172 return arr.astype(dtype, copy=copy) ValueError: invalid literal for int() with base 10: 'Pass'
pd.to_numeric(students['DSC 80 Final Grade'], errors='coerce')
0 89.0 1 90.0 2 97.0 3 54.0 4 NaN Name: DSC 80 Final Grade, dtype: float64
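Before overwriting the column, it can be worth checking exactly which entries errors='coerce' will destroy; a small sketch:
# Entries that are not parseable as numbers; these are exactly the values coerced to NaN.
grades = students['DSC 80 Final Grade']
grades[pd.to_numeric(grades, errors='coerce').isna()]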
students['DSC 80 Final Grade'] = pd.to_numeric(students['DSC 80 Final Grade'], errors='coerce')
students
| | Student ID | Student Name | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | Date |
---|---|---|---|---|---|---|---|---|
0 | A20104523 | John Black | 40000.0 | 50000.0 | 25.00% | False | 89.0 | 2020-10-12 |
1 | A20345992 | Mark White | 9200.0 | 10120.0 | 10.00% | True | 90.0 | 2019-04-15 |
2 | A21942188 | Amy Red | 50000.0 | 62500.0 | 25.00% | False | 97.0 | 2021-05-14 |
3 | A28049910 | Tom Green | 7000.0 | 9800.0 | 40.00% | True | 54.0 | 2020-07-10 |
4 | A27456704 | Rose Pink | 10000.0 | 5000.0 | -50.00% | True | NaN | 2021-03-03 |
pd.to_numeric?
'Student Name'
- We want names stored in the format 'Last Name, First Name', a common format.
- We can use the str methods once again.
students['Student Name']
0 John Black 1 Mark White 2 Amy Red 3 Tom Green 4 Rose Pink Name: Student Name, dtype: object
parts = students['Student Name'].str.split()
parts
0 [John, Black] 1 [Mark, White] 2 [Amy, Red] 3 [Tom, Green] 4 [Rose, Pink] Name: Student Name, dtype: object
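# Note: parts.str[1] below is each student's last name and parts.str[0] the first name;
# this assumes every name has exactly two words.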
students['Student Name'] = parts.str[1] + ', ' + parts.str[0]
students
| | Student ID | Student Name | 2021 tuition | 2022 tuition | Percent Growth | Paid | DSC 80 Final Grade | Date |
---|---|---|---|---|---|---|---|---|
0 | A20104523 | Black, John | 40000.0 | 50000.0 | 25.00% | False | 89.0 | 2020-10-12 |
1 | A20345992 | White, Mark | 9200.0 | 10120.0 | 10.00% | True | 90.0 | 2019-04-15 |
2 | A21942188 | Red, Amy | 50000.0 | 62500.0 | 25.00% | False | 97.0 | 2021-05-14 |
3 | A28049910 | Green, Tom | 7000.0 | 9800.0 | 40.00% | True | 54.0 | 2020-07-10 |
4 | A27456704 | Pink, Rose | 10000.0 | 5000.0 | -50.00% | True | NaN | 2021-03-03 |
1649043031 looks like a number, but is probably a date.
"USD 1,000,000" looks like a string, but is actually a number and a unit.
92093 looks like a number, but is really a zip code (and isn't equal to 92,093).
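For instance, if we assume the first value above is a Unix timestamp in seconds (an assumption for illustration), pandas can decode it:
# Interpret the integer as seconds since the Unix epoch.
pd.to_datetime(1649043031, unit='s')   # Timestamp('2022-04-04 03:30:31')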
Sometimes, False appears in a column of country codes. Why might this be? 🤔
import yaml
player = '''
name: Magnus Carlsen
age: 32
country: NO
'''
yaml.safe_load(player)
{'name': 'Magnus Carlsen', 'age': 32, 'country': False}
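One fix is to quote the value so that PyYAML's YAML 1.1 boolean rules (which read NO as False) don't apply; a minimal sketch:
# Quoting keeps 'NO' a string instead of a boolean.
player_fixed = '''
name: Magnus Carlsen
age: 32
country: "NO"
'''
yaml.safe_load(player_fixed)   # {'name': 'Magnus Carlsen', 'age': 32, 'country': 'NO'}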