from dsc80_utils import *

contact = '''
Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.
'''

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

def is_possibly_area_code(s):
    '''Does `s` look like (678)?

    Returns True only when `s` is exactly five characters long: an opening
    parenthesis, three ASCII digits, and a closing parenthesis.
    Returns False for anything else, including the empty string.
    '''
    if len(s) != 5:
        return False
    middle = s[1:4]
    # str.isnumeric() on its own also accepts non-ASCII numerics such as
    # '½' or '三', which can never be part of a dialable area code, so we
    # additionally require the three characters to be ASCII.
    return (s.startswith('(') and
            s.endswith(')') and
            middle.isascii() and
            middle.isdigit())

is_possibly_area_code('(123)')

True

is_possibly_area_code('(99)')

False

def is_last_7_phone_number(s):
    '''Does `s` look like 999-8212?

    Returns True only when `s` is exactly eight characters long: three ASCII
    digits, a hyphen, and four more ASCII digits. Returns False otherwise.
    '''
    if len(s) != 8 or s[3] != '-':
        return False
    # Everything except the hyphen must be a plain ASCII digit.
    # str.isnumeric() alone would also accept characters like '²' or '三'.
    digits = s[:3] + s[4:]
    return digits.isascii() and digits.isdigit()

is_last_7_phone_number('999-8212')

True

is_last_7_phone_number('534 1100')

False

# Split the message on whitespace and trim trailing punctuation from each token.
pieces = [token.rstrip('.,?;"\'') for token in contact.split()]

# Walk over adjacent token pairs; a phone number is an area-code token
# immediately followed by a 7-digit-number token.
for area, local in zip(pieces, pieces[1:]):
    if is_possibly_area_code(area) and is_last_7_phone_number(local):
        print(area, local)

(800) 867-5309
(800) 123-4567

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

import re
re.findall(r'\(\d{3}\) \d{3}-\d{4}', contact)

['(800) 867-5309', '(800) 123-4567']

import re

re.search('AB*A', 
          'here is a string for you: ABBBA. here is another: ABBBBBBBA')

<re.Match object; span=(26, 31), match='ABBBA'>

re.findall('AB*A', 
           'here is a string for you: ABBBA. here is another: ABBBBBBBA')

['ABBBA', 'ABBBBBBBA']

re.sub('AB*A', 
       'billy', 
       'here is a string for you: ABBBA. here is another: ABBBBBBBA')

'here is a string for you: billy. here is another: billy'

re.findall('\bcat\b', 'my cat is hungry')

[]

re.findall(r'\bcat\b', 'my cat is hungry')

['cat']

# Huh? In a regular (non-raw) string literal, Python turns '\b' into the
# backspace character, not the two characters backslash + 'b', so the regex
# engine never sees a word-boundary token. Printing the string shows only 'cat'.
print('\bcat\b')

cat

re.findall(r'\w+@(\w+)\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')

['notucsd', 'ucsd']

re.findall(r'\w+@\w+\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')

['billy@notucsd.edu', 'notbilly@ucsd.edu']

# A regex that matches strings with two of the same vowel followed by 3 digits
# We only want to capture the digits, but...
re.findall(r'(aa|ee|ii|oo|uu)(\d{3})', 'eeoo124')

[('oo', '124')]

# Sample log line to parse.
s = '''132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''

# Six capture groups: day, month, year, hour, minute, second.
# Written as a raw string so that '\[', '\/' and '\]' reach the regex engine
# unchanged instead of being treated as (invalid) string escape sequences,
# which raise a warning on modern Python versions.
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]

other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(exp, other_s)

[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

s

'132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'

# A stricter version of the timestamp pattern: two-digit day, capitalized
# three-letter month, four-digit year, two-digit hour/minute/second, and a
# '-dddd' offset. Written as a raw string so '\[', '\d', '\/' and '\]' are
# passed to the regex engine verbatim rather than being parsed as (invalid)
# string escape sequences.
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]

other_s

'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'

re.findall(new_exp, other_s)

[]

salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2021.csv')
salaries['Employee Name'] = salaries['Employee Name'].str.split().str[0] + ' Xxxx'

salaries.head()

jobtitles = salaries['Job Title']
jobtitles.head()

0                 City Attorney
1                         Mayor
2            Investment Officer
3                Police Officer
4    Independent Budget Analyst
Name: Job Title, dtype: object

jobtitles.shape[0], jobtitles.nunique()

(12305, 588)

jobtitles.value_counts().iloc[:100]

Police Officer                2123
Fire Fighter Ii                331
Assistant Engineer - Civil     284
                              ... 
Executive Assistant             26
Paralegal                       26
Librarian Iv                    25
Name: Job Title, Length: 100, dtype: int64

jobtitles.value_counts().iloc[:25].sort_values().plot(kind='barh')

jobtitles.isna().sum()

2

jobtitles = jobtitles[jobtitles.notna()]

# Uses character class negation
jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True).sum()

845

jobtitles[jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True)].head()

281          Park & Recreation Director
539     Associate Engineer - Mechanical
1023         Associate Engineer - Civil
1376       Associate Engineer - Traffic
1460       Budget/Legislative Analyst I
Name: Job Title, dtype: object

# Why are we converting to lowercase? `str.contains` is case-sensitive by
# default, so lowercasing first lets 'to' match regardless of capitalization
# (e.g. the 'To' in 'Assistant To The Director').
jobtitles.str.lower().str.contains('to').sum()

1541

jobtitles[jobtitles.str.lower().str.contains('to')]

0                             City Attorney
10       Assistant Retirement Administrator
25                      Department Director
                        ...                
12190                       Deputy Director
12210            City Attorney Investigator
12267                       Test Monitor Ii
Name: Job Title, Length: 1541, dtype: object

jobtitles.str.lower().str.contains(r'\bto\b', regex=True).sum()

11

jobtitles[jobtitles.str.lower().str.contains(r'\bto\b', regex=True)]

664               Assistant To The Fire Chief
1403     Principal Assistant To City Attorney
2358                Assistant To The Director
                         ...                 
7544          Confidential Secretary To Mayor
9627     Principal Assistant To City Attorney
12061               Assistant To The Director
Name: Job Title, Length: 11, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bthe\b', regex=True)]

664      Assistant To The Fire Chief
2358       Assistant To The Director
4459       Assistant To The Director
5685       Assistant To The Director
12061      Assistant To The Director
Name: Job Title, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bfor\b', regex=True)]

3676     Assistant For Community Outreach
4451     Assistant For Community Outreach
11010    Assistant For Community Outreach
Name: Job Title, dtype: object

# Canonicalize the job titles one step at a time.
jobtitles = jobtitles.str.lower()
# Drop the standalone "glue" words 'to', 'the' and 'for'.
jobtitles = jobtitles.str.replace(r'\bto\b|\bthe\b|\bfor\b', '', regex=True)
# Turn every character that is not a letter, digit or space into a space.
jobtitles = jobtitles.str.replace('[^A-Za-z0-9 ]', ' ', regex=True)
# ' +' matches 1 or more spaces, so runs of spaces collapse into one.
jobtitles = jobtitles.str.replace(' +', ' ', regex=True)
# Remove leading/trailing spaces if present.
jobtitles = jobtitles.str.strip()

jobtitles.sample(10)

5584               senior clerk typist
11658     assistant management analyst
8779     grounds maintenance worker ii
                     ...              
5751          associate engineer civil
9138     grounds maintenance worker ii
8213       payroll audit specialist ii
Name: Job Title, Length: 10, dtype: object

jobtitles[jobtitles.str.contains('asst')].value_counts()

Series([], Name: Job Title, dtype: int64)

jobtitles[jobtitles.str.contains('assistant')].value_counts().head()

assistant engineer civil    284
library assistant i         127
library assistant ii        116
library assistant iii       107
clerical assistant ii       100
Name: Job Title, dtype: int64

operation	order of op.	example	matches ✅	does not match ❌
concatenation	3	`AABAAB`	`'AABAAB'`	every other string
or	4	`AA|BAAB`	`'AA'`, `'BAAB'`	every other string
closure (zero or more)	2	`AB*A`	`'AA'`, `'ABBBBBBA'`	`'AB'`, `'ABABA'`
parentheses	1	`A(A|B)AAB` `(AB)*A`	`'AAAAB'`, `'ABAAB'` `'A'`, `'ABABABABA'`	every other string `'AA'`, `'ABBA'`

operation	example	matches ✅	does not match ❌
wildcard	`.U.U.U.`	`'CUMULUS'` `'JUGULUM'`	`'SUCCUBUS'` `'TUMULTUOUS'`
character class	`[A-Za-z][a-z]*`	`'word'` `'Capitalized'`	`'camelCase'` `'4illegal'`
at least one	`bi(ll)+y`	`'billy'` `'billlllly'`	`'biy'` `'bily'`
between $i$ and $j$ occurrences	`m[aeiou]{1,2}m`	`'mem'` `'maam'` `'miem'`	`'mm'` `'mooom'` `'meme'`

operation	example	matches ✅	does not match ❌
escape character	`ucsd\.edu`	`'ucsd.edu'`	`'ucsd!edu'`
beginning of line	`^ark`	`'ark two'` `'ark o ark'`	`'dark'`
end of line	`ark$`	`'dark'` `'ark o ark'`	`'ark two'`
zero or one	`cat?`	`'ca'` `'cat'`	`'cart'` (matches `'ca'` only)
built-in character classes*	`\w+` `\d+`	`'billy'` `'231231'`	`'this person'` `'858 people'`
character class negation	`[^a-z]+`	`'KINGTRITON551'` `'1721$$'`	`'porch'` `'billy.edu'`

	Employee Name	Job Title	Base Pay	...	Year	Notes	Agency	Status
0	Mara Xxxx	City Attorney	218759.0	...	2021	NaN	San Diego	FT
1	Todd Xxxx	Mayor	218759.0	...	2021	NaN	San Diego	FT
2	Elizabeth Xxxx	Investment Officer	259732.0	...	2021	NaN	San Diego	FT
3	Terence Xxxx	Police Officer	212837.0	...	2021	NaN	San Diego	FT
4	Andrea Xxxx	Independent Budget Analyst	224312.0	...	2021	NaN	San Diego	FT

	senior	lecturer	teaching	professor	assistant	associate
senior lecturer	1	1	0	0	0	0
assistant teaching professor	0	0	1	1	1	0
associate professor	0	0	0	1	0	1
senior assistant to the assistant professor	1	0	0	1	2	0

Lecture 11 – Regular Expressions¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

Agenda¶

Motivation¶

Who called? 📞¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Exercise¶

Exercise¶

Intermediate regex¶

More regex syntax¶

Exercise¶

Exercise¶

Even more regex syntax¶

Example (built-in character classes)¶

Exercise¶

Regex in Python¶

re in Python¶

Raw strings¶

Capture groups¶

Example: Log parsing¶

The more specific, the better!¶

Limitations¶

Limitations of regexes¶

Text features¶

Review: Regression and features¶

Moving forward¶

Text features¶

Example: San Diego employee salaries¶

Aside on privacy and ethics¶

Goal: Quantifying similarity¶

Exploring job titles¶

Canonicalization¶

Punctuation¶

"Glue" words¶

Fixing punctuation and removing "glue" words¶

Possible issue: inconsistent representations¶

Bag of words 💰¶

Text similarity¶

A counts matrix¶

`re` in Python¶