In [1]:
from dsc80_utils import *

Lecture 11 – Regular Expressions and Text Features¶

DSC 80, Winter 2024¶

Announcements 📣¶

  • Lab 6 is due on Wednesday, February 21st at 5PM (no slip days!).

    • Remember that next Monday is a holiday.
  • Project 3 is out! In it, you'll implement an N-Gram language model. We'll start covering relevant topics for it today.

    • The checkpoint is due on Thursday, February 22nd.
    • The full project is due on Thursday, February 29th.
  • Midterm Exam regrades are due tomorrow.

    • We've given back partial credit on a few questions, like 1e, bringing the mean up ~1%.
  • If at least 80% of the class fills out the Mid-Quarter Survey by Saturday at 11:59PM, everyone will earn 2 extra points (2.5%) on the Midterm Exam!

Agenda 📆¶

  • Most of today's lecture will be about regular expressions. Good resources:
    • regex101.com, a helpful site to have open while writing regular expressions.
    • Python re library documentation and how-to.
      • The "how-to" is great, read it!
    • regex "cheat sheet" (taken from here).
    • These are all on the resources tab of the course website as well.
  • With remaining time, we'll start discussing text features.
    • How can we use strings as inputs in linear regression, for example?

Exercise

Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.

<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th> 
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td> 
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td> 
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td> 
            <td>N95</td>
        </tr>
    </table>
</body>

Part 4: Complete the implementation of the function top_nth, which takes in a positive integer n and returns the name of the n-th ranked song in the HTML document. For instance, top_nth(2) should evaluate to "First Class" (n=1 corresponds to the top song).

Note: Your implementation should work in the case that the page contains more than 3 songs.

def top_nth(n):
    return soup.find("tr", attrs=__(a)__).find_all("td")__(b)__

Motivation¶

In [2]:
contact = '''
Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.
'''

Who called? 📞¶

  • Goal: Extract all phone numbers from a piece of text, assuming they are of the form '(###) ###-####'.
In [3]:
print(contact)
Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

  • We can do this using the same string methods we've come to know and love.

  • Strategy:

    • Split by spaces.
    • Check if there are any consecutive "words" where:
      • the first "word" looks like an area code, like '(678)'.
      • the second "word" looks like the last 7 digits of a phone number, like '999-8212'.

Let's first write a function that takes in a string and returns whether it looks like an area code.

In [4]:
def is_possibly_area_code(s):
    '''Does `s` look like (678)?'''
    return (len(s) == 5 and
            s.startswith('(') and
            s.endswith(')') and
            s[1:4].isnumeric())
In [5]:
is_possibly_area_code('(123)')
Out[5]:
True
In [6]:
is_possibly_area_code('(99)')
Out[6]:
False

Let's also write a function that takes in a string and returns whether it looks like the last 7 digits of a phone number.

In [7]:
def is_last_7_phone_number(s):
    '''Does `s` look like 999-8212?'''
    return len(s) == 8 and s[0:3].isnumeric() and s[3] == '-' and s[4:].isnumeric()
In [8]:
is_last_7_phone_number('999-8212')
Out[8]:
True
In [9]:
is_last_7_phone_number('534 1100')
Out[9]:
False

Finally, let's split the entire text by spaces, and check whether there are any instances where pieces[i] looks like an area code and pieces[i+1] looks like the last 7 digits of a phone number.

In [10]:
# Removes punctuation from the end of each string.
pieces = [s.rstrip('.,?;"\'') for s in contact.split()]

for i in range(len(pieces) - 1):
    if is_possibly_area_code(pieces[i]):
        if is_last_7_phone_number(pieces[i+1]):
            print(pieces[i], pieces[i+1])
(800) 867-5309
(800) 123-4567

Is there a better way?¶

  • This was an example of pattern matching.
  • It can be done with string methods, but there is often a better approach: regular expressions.
In [11]:
print(contact)
Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

In [12]:
import re
re.findall(r'\(\d{3}\) \d{3}-\d{4}', contact)
Out[12]:
['(800) 867-5309', '(800) 123-4567']

🤯

Basic regular expressions¶

Regular expressions¶

  • A regular expression, or regex for short, is a sequence of characters used to match patterns in strings.
    • For example, \(\d{3}\) \d{3}-\d{4} describes a pattern that matches US phone numbers of the form '(XXX) XXX-XXXX'.
    • Think of regex as a "mini-language" (formally: they are a grammar for describing a language).
  • Pros: They are very powerful and are widely used (virtually every programming language has a module for working with them).
  • Cons: They can be hard to read and have many different "dialects."

Writing regular expressions¶

  • You will ultimately write most of your regular expressions in Python, using the re module. We will see how to do so shortly.

  • However, a useful tool for designing regular expressions is regex101.com.

  • We will use it heavily during lecture; you should have it open as we work through examples. If you're trying to revisit this lecture in the future, you'll likely want to watch the podcast.

Literals¶

  • A literal is a character that has no special meaning.

  • Letters, numbers, and some symbols are all literals.

  • Some symbols, like ., *, (, and ), are special characters.

  • *Example*: The regex hey matches the string 'hey'. The regex he. also matches the string 'hey'.

Regex building blocks 🧱¶

The four main building blocks for all regexes are shown below (table source, inspiration).

  • concatenation (order of op.: 3): AABAAB matches 'AABAAB' ✅, and no other string ❌.
  • or (order of op.: 4): AA|BAAB matches 'AA' and 'BAAB' ✅, and no other string ❌.
  • closure, i.e. zero or more (order of op.: 2): AB*A matches 'AA' and 'ABBBBBBA' ✅, but not 'AB' or 'ABABA' ❌.
  • parentheses (order of op.: 1): A(A|B)AAB matches 'AAAAB' and 'ABAAB' ✅, and no other string ❌; (AB)*A matches 'A' and 'ABABABABA' ✅, but not 'AA' or 'ABBA' ❌.

Note that |, (, ), and * are special characters, not literals. They manipulate the characters around them.

*Example (or, parentheses)*:

  • What does DSC 30|80 match?
  • What does DSC (30|80) match?

*Example (closure, parentheses)*:

  • What does blah* match?
  • What does (blah)* match?
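These questions can be checked directly in Python's re module; a small sketch (the test strings are made up for illustration):

```python
import re

# Concatenation binds tighter than |, so 'DSC 30|80' means ('DSC 30') or ('80').
print(re.findall(r'DSC 30|80', 'DSC 30 and DSC 80'))    # ['DSC 30', '80']
# With parentheses, the alternation is limited to 30 vs. 80.
print(re.findall(r'DSC (30|80)', 'DSC 30 and DSC 80'))  # ['30', '80'] (the captured group)

# * applies only to the preceding character, so blah* means 'bla' plus zero or more 'h's.
print(re.findall(r'blah*', 'blahhh and bla'))           # ['blahhh', 'bla']
```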

Exercise

Write a regular expression that matches 'billy', 'billlly', 'billlllly', etc.

  • First, think about how to match strings with any even number of 'l's, including zero 'l's (i.e. 'biy').
  • Then, think about how to match only strings with a positive even number of 'l's.

✅ Click here to see the answer after you've tried it yourself at regex101.com.

bi(ll)*y will match any even number of 'l's, including 0.

To match only a positive even number of 'l's, we'd need to first "fix into place" two 'l's, and then follow that up with zero or more pairs of 'l's. This specifies the regular expression bill(ll)*y.
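As a sanity check, re.fullmatch (which requires the entire string to match) can be used to compare the two answers; the test strings here are made up:

```python
import re

# bi(ll)*y allows zero 'l's; bill(ll)*y requires at least one 'll' pair.
for s in ['biy', 'billy', 'billlly']:
    print(s,
          bool(re.fullmatch(r'bi(ll)*y', s)),
          bool(re.fullmatch(r'bill(ll)*y', s)))
```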

Exercise

Write a regular expression that matches 'billy', 'billlly', 'biggy', 'biggggy', etc.

Specifically, it should match any string with a positive even number of 'l's in the middle, or a positive even number of 'g's in the middle.


✅ Click here to see the answer after you've tried it yourself at regex101.com.

Possible answers: bi(ll(ll)*|gg(gg)*)y or bill(ll)*y|bigg(gg)*y.


Note, bill(ll)*|gg(gg)*y is not a valid answer! This is because "concatenation" comes before "or" in the order of operations. This regular expression would match strings that match bill(ll)*, like 'billll', OR strings that match gg(gg)*y, like 'ggy'.

Intermediate regex¶

More regex syntax¶

  • wildcard: .U.U.U. matches 'CUMULUS' and 'JUGULUM' ✅, but not 'SUCCUBUS' or 'TUMULTUOUS' ❌.
  • character class: [A-Za-z][a-z]* matches 'word' and 'Capitalized' ✅, but not 'camelCase' or '4illegal' ❌.
  • at least one: bi(ll)+y matches 'billy' and 'billlllly' ✅, but not 'biy' or 'bily' ❌.
  • between $i$ and $j$ occurrences: m[aeiou]{1,2}m matches 'mem', 'maam', and 'miem' ✅, but not 'mm', 'mooom', or 'meme' ❌.

., [, ], +, {, and } are also special characters, in addition to |, (, ), and *.

*Example (character classes, at least one)*: [A-E]+ is just shorthand for `(A|B|C|D|E)(A|B|C|D|E)*`.

*Example (wildcard)*:

  • What does . match?
  • What does he. match?
  • What does ... match?

*Example (at least one, closure)*:

  • What does 123+ match?
  • What does 123* match?

*Example (number of occurrences)*: What does tri{3,5} match? Does it match 'triiiii'?

*Example (character classes, number of occurrences)*: What does [1-6a-f]{3}-[7-9E-S]{2} match?
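A quick check of the last example with re.fullmatch (the strings '1ab-9K' and '7ab-9K' are made-up examples):

```python
import re

pat = r'[1-6a-f]{3}-[7-9E-S]{2}'
print(bool(re.fullmatch(pat, '1ab-9K')))  # True: 1, a, b are in [1-6a-f]; 9, K are in [7-9E-S]
print(bool(re.fullmatch(pat, '7ab-9K')))  # False: 7 is not in [1-6a-f]
```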

Exercise

Write a regular expression that matches any lowercase string that has a repeated vowel, such as 'noon', 'peel', 'festoon', or 'zeebraa'.


✅ Click here to see the answer after you've tried it yourself at regex101.com.

One answer: [a-z]*(aa|ee|ii|oo|uu)[a-z]*


This regular expression matches strings of lowercase characters that have 'aa', 'ee', 'ii', 'oo', or 'uu' in them anywhere. [a-z]* means "zero or more of any lowercase characters"; essentially we are saying it doesn't matter what letters come before or after the double vowels, as long as the double vowels exist somewhere.
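Trying the answer on the example strings from the prompt ('paint' is a made-up non-match, included for contrast):

```python
import re

pat = r'[a-z]*(aa|ee|ii|oo|uu)[a-z]*'
for s in ['noon', 'peel', 'festoon', 'zeebraa', 'paint']:
    print(s, bool(re.fullmatch(pat, s)))
```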

Exercise

Write a regular expression that matches any string that contains both a lowercase letter and a number, in any order. Examples include 'billy80', '80!!billy', and 'bil8ly0'.


✅ Click here to see the answer after you've tried it yourself at regex101.com.

One answer: (.*[a-z].*[0-9].*)|(.*[0-9].*[a-z].*)


We can break the above regex into two parts – everything before the |, and everything after the |.

The first part, .*[a-z].*[0-9].*, matches strings in which there is at least one lowercase character and at least one digit, with the lowercase character coming first.

The second part, .*[0-9].*[a-z].*, matches strings in which there is at least one lowercase character and at least one digit, with the digit coming first.

Note, the .* between the digit and letter classes is needed in the event the string has non-digit and non-letter characters.

This is the kind of task that would be easier to accomplish with regular Python string methods.
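For contrast, a string-method version of the same check might look like this (the function name is ours, not from the lecture):

```python
def has_lower_and_digit(s):
    # True if s contains at least one lowercase letter and at least one digit, in any order.
    return any(c.islower() for c in s) and any(c.isdigit() for c in s)

print(has_lower_and_digit('billy80'))    # True
print(has_lower_and_digit('80!!billy'))  # True
print(has_lower_and_digit('BILLY80'))    # False (no lowercase letter)
```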

Even more regex syntax¶

  • escape character: ucsd\.edu matches 'ucsd.edu' ✅, but not 'ucsd!edu' ❌.
  • beginning of line: ^ark matches 'ark two' and 'ark o ark' ✅, but not 'dark' ❌.
  • end of line: ark$ matches 'dark' and 'ark o ark' ✅, but not 'ark two' ❌.
  • zero or one: cat? matches 'ca' and 'cat' ✅, but not 'cart' ❌ (it only matches the 'ca' in 'cart').
  • built-in character classes*: \w+ matches 'billy' ✅ but not 'this person' ❌; \d+ matches '231231' ✅ but not '858 people' ❌.
  • character class negation: [^a-z]+ matches 'KINGTRITON551' and '1721$$' ✅, but not 'porch' or 'billy.edu' ❌.

*Note*: in Python's implementation of regex,

  • \d refers to digits.
  • \w refers to alphanumeric characters ([A-Z][a-z][0-9]_). Whenever we say "alphanumeric" in an assignment, we're referring to \w!
  • \s refers to whitespace.
  • \b is a word boundary.

*Example (escaping)*:

  • What does he. match?
  • What does he\. match?
  • What does (858) match?
  • What does \(858\) match?

*Example (anchors)*:

  • What does 858-534 match?
  • What does ^858-534 match?
  • What does 858-534$ match?

*Example (built-in character classes)*:

  • What does \d{3} \d{3}-\d{4} match?
  • What does \bcat\b match? Does it find a match in 'my cat is hungry'? What about 'concatenate' or 'kitty cat'?

Remember, in Python's implementation of regex,

  • \d refers to digits.
  • \w refers to alphanumeric characters ([A-Z][a-z][0-9]_). Whenever we say "alphanumeric" in an assignment, we're referring to \w!
  • \s refers to whitespace.
  • \b is a word boundary.
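For instance, \bcat\b only matches 'cat' when it appears as a standalone word:

```python
import re

print(re.findall(r'\bcat\b', 'my cat is hungry'))  # ['cat']
print(re.findall(r'\bcat\b', 'concatenate'))       # [] -- 'cat' is inside a larger word
print(re.findall(r'\bcat\b', 'kitty cat'))         # ['cat']
```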

Exercise

Write a regular expression that matches any string that:

  • is between 5 and 10 characters long, and
  • is made up of only vowels (either uppercase or lowercase, including 'Y' and 'y'), periods, and spaces.

Examples include 'yoo.ee.IOU' and 'AI.I oey'.


✅ Click here to see the answer after you've tried it yourself at regex101.com.

One answer: ^[aeiouyAEIOUY. ]{5,10}$


Key idea: Within a character class (i.e. [...]), special characters do not generally need to be escaped.
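Checking the answer on the two examples from the prompt (the third string is a made-up non-match: it uses only allowed characters but is too long):

```python
import re

pat = r'^[aeiouyAEIOUY. ]{5,10}$'
print(bool(re.search(pat, 'yoo.ee.IOU')))  # True: 10 allowed characters
print(bool(re.search(pat, 'AI.I oey')))    # True: 8 allowed characters
print(bool(re.search(pat, 'aeiou' * 3)))   # False: 15 characters is too long
```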

Regex in Python¶

re in Python¶

The re package is built into Python. It allows us to use regular expressions to find, extract, and replace strings.

In [13]:
import re

re.search takes in a string regex and a string text and returns the location and substring corresponding to the first match of regex in text.

In [14]:
re.search('AB*A', 
          'here is a string for you: ABBBA. here is another: ABBBBBBBA')
Out[14]:
<re.Match object; span=(26, 31), match='ABBBA'>

re.findall takes in a string regex and a string text and returns a list of all matches of regex in text. You'll use this most often.

In [15]:
re.findall('AB*A', 
           'here is a string for you: ABBBA. here is another: ABBBBBBBA')
Out[15]:
['ABBBA', 'ABBBBBBBA']

re.sub takes in a string regex, a string repl, and a string text, and replaces all matches of regex in text with repl.

In [16]:
re.sub('AB*A', 
       'billy', 
       'here is a string for you: ABBBA. here is another: ABBBBBBBA')
Out[16]:
'here is a string for you: billy. here is another: billy'

Raw strings¶

When using regular expressions in Python, it's a good idea to use raw strings, denoted by an r before the quotes, e.g. r'exp'.

In [17]:
re.findall('\bcat\b', 'my cat is hungry')
Out[17]:
[]
In [18]:
re.findall(r'\bcat\b', 'my cat is hungry')
Out[18]:
['cat']
In [19]:
# Huh?
print('\bcat\b')
cat
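The reason: in a regular string, Python interprets \b as a backspace character (ASCII 8), so the regex engine never sees the two-character sequence \b. A raw string passes the backslash through untouched:

```python
print(repr('\bcat\b'))   # '\x08cat\x08' -- backspace characters, not word boundaries
print(repr(r'\bcat\b'))  # '\\bcat\\b'   -- the backslashes survive for the regex engine
```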

Capture groups¶

  • Surround a regex with ( and ) to define a capture group within a pattern.
  • Capture groups are useful for extracting relevant parts of a string.
In [20]:
re.findall(r'\w+@(\w+)\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')
Out[20]:
['notucsd', 'ucsd']
  • Notice what happens if we remove the ( and )!
In [21]:
re.findall(r'\w+@\w+\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')
Out[21]:
['billy@notucsd.edu', 'notbilly@ucsd.edu']
  • Earlier, we also saw that parentheses can be used to group parts of a regex together. When using re.findall, all groups are treated as capturing groups.
In [22]:
# A regex that matches strings with two of the same vowel followed by 3 digits
# We only want to capture the digits, but...
re.findall(r'(aa|ee|ii|oo|uu)(\d{3})', 'eeoo124')
Out[22]:
[('oo', '124')]
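If you want grouping without capturing, regex also supports non-capturing groups, written (?:...); with them, only the digits are returned:

```python
import re

# (?:...) groups the vowel alternatives without capturing them.
print(re.findall(r'(?:aa|ee|ii|oo|uu)(\d{3})', 'eeoo124'))  # ['124']
```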

Example: Log parsing¶

Web servers typically record every request made of them in the "logs".

In [23]:
s = '''132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''

Let's use our new regex syntax (including capturing groups) to extract the day, month, year, and time from the log string s.

In [24]:
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)
Out[24]:
[('24', 'Feb', '2023', '12', '26', '15')]

While the above regex works, it is not very specific: it also matches incorrectly formatted log strings.

In [25]:
other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(exp, other_s)
Out[25]:
[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

The more specific, the better!¶

  • Be as specific in your pattern matching as possible – you don't want to match and extract strings that don't fit the pattern you care about.
    • .* matches every possible string, but we don't use it very often.
  • A better date extraction regex:
\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]
  • `\d{2}` matches any 2-digit number.
  • `[A-Z]{1}` matches any single uppercase letter.
  • `[a-z]{2}` matches any 2 consecutive lowercase letters.
  • Remember, special characters (`[`, `]`, `/`) need to be escaped with `\`.
In [26]:
s
Out[26]:
'132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'
In [27]:
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)
Out[27]:
[('24', 'Feb', '2023', '12', '26', '15')]

A benefit of new_exp over exp is that it doesn't capture anything when the string doesn't follow the format we specified.

In [28]:
other_s
Out[28]:
'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
In [29]:
re.findall(new_exp, other_s)
Out[29]:
[]

Exercise

^\w{2,5}.\d*\/[^A-Z5]{1,}

Select all strings below that contain any match with the regular expression above.

  • "billy4/Za"
  • "billy4/za"
  • "DAI_s2154/pacific"
  • "daisy/ZZZZZ"
  • "bi_/_lly98"
  • "!@__!14/atlantic"

Limitations of regular expressions¶

Writing a regular expression is like writing a program.

  • You need to know the syntax well.
  • They can be easier to write than to read.
  • They can be difficult to debug.

Regular expressions are terrible at certain types of problems. Examples:

  • Anything involving counting (same number of instances of a and b).
  • Anything involving complex structure (palindromes).
  • Parsing highly complex text structure (HTML, for instance).

Text features¶


Review: Regression and features¶

  • In DSC 40A, our running example was to use regression to predict a data scientist's salary, given their GPA, years of experience, and years of education.
  • After minimizing empirical risk to determine optimal parameters, $w_0^*, \dots, w_3^*$, we made predictions using:
$$\text{predicted salary} = w_0^* + w_1^* \cdot \text{GPA} + w_2^* \cdot \text{experience} + w_3^* \cdot \text{education}$$
  • GPA, years of experience, and years of education are features – they represent a data scientist as a vector of numbers.
    • e.g. Your feature vector may be [3.5, 1, 7].
  • This approach requires features to be numerical.

Moving forward¶

Suppose we'd like to predict the sentiment of a piece of text from 1 to 10.

  • 10: Very positive (happy).
  • 1: Very negative (sad, angry).

Example:

  • Input: "DSC 80 is a pretty good class."

  • Output: 7.

  • We can frame this as a regression problem, but we can't directly use what we learned in 40A, because here our inputs are text, not numbers.

Text features¶

  • Big question: How do we represent a text document as a feature vector of numbers?
  • If we can do this, we can:
    • use a text document as input in a regression or classification model (in a few lectures).
    • quantify the similarity of two text documents (today).

Example: San Diego employee salaries¶

  • Transparent California publishes the salaries of all City of San Diego employees.
  • Let's look at the 2022 data.
In [30]:
salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2022.csv')
salaries['Employee Name'] = salaries['Employee Name'].str.split().str[0] + ' Xxxx'
In [31]:
salaries.head()
Out[31]:
Employee Name Job Title Base Pay Overtime Pay ... Year Notes Agency Status
0 Mara Xxxx City Attorney 227441.53 0.00 ... 2022 NaN San Diego FT
1 Todd Xxxx Mayor 227441.53 0.00 ... 2022 NaN San Diego FT
2 Terence Xxxx Assistant Police Chief 227224.32 0.00 ... 2022 NaN San Diego FT
3 Esmeralda Xxxx Police Sergeant 124604.40 162506.54 ... 2022 NaN San Diego FT
4 Marcelle Xxxx Assistant Retirement Administrator 279868.04 0.00 ... 2022 NaN San Diego FT

5 rows × 13 columns

Aside on privacy and ethics¶

  • Even though the data we downloaded is publicly available, employee names still correspond to real people.
  • Be careful when dealing with PII (personally identifiable information).
    • Only work with the data that is needed for your analysis.
    • Even when data is public, people have a reasonable right to privacy.
  • Remember to think about the impacts of your work outside of your Jupyter Notebook.

Goal: Quantifying similarity¶

  • Our goal is to describe, numerically, how similar two job titles are.
  • For instance, our similarity metric should tell us that 'Deputy Fire Chief' and 'Fire Battalion Chief' are more similar than 'Deputy Fire Chief' and 'City Attorney'.
  • Idea: Two job titles are similar if they contain shared words, regardless of order. So, to measure the similarity between two job titles, let's count the number of words they share in common.
  • Before we do this, we need to be confident that the job titles are clean and consistent – let's explore.

Exploring job titles¶

In [32]:
jobtitles = salaries['Job Title']
jobtitles.head()
Out[32]:
0                         City Attorney
1                                 Mayor
2                Assistant Police Chief
3                       Police Sergeant
4    Assistant Retirement Administrator
Name: Job Title, dtype: object

How many employees are in the dataset? How many unique job titles are there?

In [33]:
jobtitles.shape[0], jobtitles.nunique()
Out[33]:
(12831, 611)

What are the most common job titles?

In [34]:
jobtitles.value_counts().iloc[:100]
Out[34]:
Police Officer Ii               1082
Police Sergeant                  311
Fire Fighter Ii                  306
                                ... 
Public Works Supervisor           29
Project Assistant                 29
Associate Engineer - Traffic      29
Name: Job Title, Length: 100, dtype: int64
In [ ]:
jobtitles.value_counts().iloc[:10].sort_values().plot(kind='barh')

Are there any missing job titles?

In [36]:
jobtitles.isna().sum()
Out[36]:
0

Fortunately, no.

Canonicalization¶

Remember, our goal is ultimately to count the number of shared words between job titles. But before we start counting the number of shared words, we need to consider the following:

  • Some job titles may have punctuation, like '-' and '&', which may count as words when they shouldn't.
    • 'Assistant - Manager' and 'Assistant Manager' should count as the same job title.
  • Some job titles may have "glue" words, like 'to' and 'the', which (we can argue) also shouldn't count as words.
    • 'Assistant To The Manager' and 'Assistant Manager' should count as the same job title.
  • If we just want to focus on the titles themselves, then perhaps roman numerals should be removed: that is, 'Police Officer Ii' and 'Police Officer I' should count as the same job title.

Let's address the above issues. The process of converting job titles so that they are always represented the same way is called canonicalization.

Punctuation¶

Are there job titles with unnecessary punctuation that we can remove?

  • To find out, we can write a regular expression that looks for characters other than letters, numbers, and spaces.

  • We can use regular expressions with the .str methods we learned earlier in the quarter just by using regex=True.

In [37]:
# Uses character class negation.
jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True).sum()
Out[37]:
922
In [38]:
jobtitles[jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True)].head()
Out[38]:
137          Park & Recreation Director
248     Associate Engineer - Mechanical
734     Associate Engineer - Electrical
882        Associate Engineer - Traffic
1045         Associate Engineer - Civil
Name: Job Title, dtype: object

It seems like we should replace these pieces of punctuation with a single space.

"Glue" words¶

Are there job titles with "glue" words in the middle, such as 'Assistant to the Manager'?

To figure out if any titles contain the word 'to', we can't just do the following, because it will evaluate to True for job titles that have 'to' anywhere in them, even if not as a standalone word.

In [39]:
# Why are we converting to lowercase?
jobtitles.str.lower().str.contains('to').sum()
Out[39]:
1577
In [40]:
jobtitles[jobtitles.str.lower().str.contains('to')]
Out[40]:
0                             City Attorney
4        Assistant Retirement Administrator
8                  Retirement Administrator
                        ...                
12778                        Test Monitor I
12812                           Custodian I
12826              Word Processing Operator
Name: Job Title, Length: 1577, dtype: object

Instead, we need to look for 'to' separated by word boundaries.

In [41]:
jobtitles.str.lower().str.contains(r'\bto\b', regex=True).sum()
Out[41]:
10
In [42]:
jobtitles[jobtitles.str.lower().str.contains(r'\bto\b', regex=True)]
Out[42]:
1638              Assistant To The Chief Operating Officer
2183                  Principal Assistant To City Attorney
2238                             Assistant To The Director
                               ...                        
6594     Confidential Secretary To Chief Operating Officer
6832                       Confidential Secretary To Mayor
11028                      Confidential Secretary To Mayor
Name: Job Title, Length: 10, dtype: object

We can look for other filler words too, like 'the' and 'for'.

In [43]:
jobtitles[jobtitles.str.lower().str.contains(r'\bthe\b', regex=True)]
Out[43]:
1638    Assistant To The Chief Operating Officer
2238                   Assistant To The Director
5609                   Assistant To The Director
6544                   Assistant To The Director
Name: Job Title, dtype: object
In [44]:
jobtitles[jobtitles.str.lower().str.contains(r'\bfor\b', regex=True)]
Out[44]:
3449     Assistant For Community Outreach
6889     Assistant For Community Outreach
10810    Assistant For Community Outreach
Name: Job Title, dtype: object

We should probably remove these "glue" words.

Roman numerals (e.g. "Ii")¶

Lastly, let's try to identify job titles that have roman numerals at the end, like 'i' (1), 'ii' (2), 'iii' (3), or 'iv' (4). As before, we'll convert to lowercase first.

In [45]:
jobtitles[jobtitles.str.lower().str.contains(r'\bi+v?\b', regex=True)]
Out[45]:
5                   Police Officer Ii
10                  Police Officer Ii
48       Fire Prevention Inspector Ii
                     ...             
12822           Clerical Assistant Ii
12828               Police Officer Ii
12830                Police Officer I
Name: Job Title, Length: 6087, dtype: object

Let's get rid of those numbers, too.

Fixing punctuation and removing "glue" words and roman numerals¶

Let's put the preceding three steps together and canonicalize job titles by:

  • converting to lowercase,
  • removing each occurrence of 'to', 'the', and 'for',
  • replacing each non-letter/digit/space character with a space,
  • replacing each roman numeral ('i', 'ii', 'iii', or 'iv') with nothing, and
  • replacing each sequence of multiple spaces with a single space.
In [46]:
jobtitles = (
    jobtitles
    .str.lower()
    .str.replace(r'\bto\b|\bthe\b|\bfor\b', '', regex=True)
    .str.replace(r'[^A-Za-z0-9 ]', ' ', regex=True)
    .str.replace(r'\bi+v?\b', '', regex=True)
    .str.replace(r' +', ' ', regex=True)               # ' +' matches 1 or more occurrences of a space.
    .str.strip()                                       # Removes leading/trailing spaces if present.
)
In [47]:
jobtitles.sample(5)
Out[47]:
2011       program manager
8150        police officer
10000    library assistant
9457        utility worker
7341        utility worker
Name: Job Title, dtype: object
In [48]:
(jobtitles == 'police officer').sum()
Out[48]:
1378

Possible issue: inconsistent representations¶

Another possible issue is that some job titles may have inconsistent representations of the same word (e.g. 'Asst.' vs 'Assistant').

In [49]:
jobtitles[jobtitles.str.contains('asst')].value_counts()
Out[49]:
Series([], Name: Job Title, dtype: int64)
In [50]:
jobtitles[jobtitles.str.contains('assistant')].value_counts().head()
Out[50]:
library assistant            385
assistant engineer civil     297
clerical assistant           111
assistant center director     50
assistant chemist             45
Name: Job Title, dtype: int64

The 2020 salaries dataset had several of these issues, but fortunately they appear to be fixed for us in the 2022 dataset (thanks, Transparent California).

Bag of words 💰¶

Text similarity¶

Recall, our idea is to measure the similarity of two job titles by counting the number of shared words between the job titles. How do we actually do that, for all of the job titles we have?

A counts matrix¶

Let's create a "counts" matrix, such that:

  • there is 1 row per job title,
  • there is 1 column per unique word that is used in job titles, and
  • the value in row title and column word is the number of occurrences of word in title.

Such a matrix might look like:

                                              senior  lecturer  teaching  professor  assistant  associate
senior lecturer                                    1         1         0          0          0          0
assistant teaching professor                       0         0         1          1          1          0
associate professor                                0         0         0          1          0          1
senior assistant to the assistant professor        1         0         0          1          2          0
  • Then, we can make statements like:
    • "assistant teaching professor" is more similar to "associate professor" than to "senior lecturer".
  • Next time!
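Before next lecture, here's a minimal sketch of how such a counts matrix could be built with pandas (variable names are ours; the real implementation comes next time):

```python
import pandas as pd

titles = pd.Series([
    'senior lecturer',
    'assistant teaching professor',
    'associate professor',
    'senior assistant to the assistant professor',
])

# One column per unique word; each entry counts that word's occurrences in the title.
vocab = sorted(set(' '.join(titles).split()))
counts = pd.DataFrame(
    {word: titles.str.split().apply(lambda words: words.count(word)) for word in vocab}
)
counts.index = titles
print(counts)
```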

Summary, next time¶

Summary¶

  • Regular expressions are used to match and extract patterns from text.
  • You don't need to force yourself to "memorize" regex syntax – refer to the resources in the Agenda section of the lecture and on the Resources tab of the course website.
  • Also refer to the three tables of syntax in the lecture:
    • Regex building blocks.
    • More regex syntax.
    • Even more regex syntax.
  • Note: You don't always have to use regular expressions! If Python/pandas string methods work for your task, you can still use those.
    • Play Regex Golf to practice! 🏌️
  • pandas .str methods can use regular expressions; just set regex=True.
  • One way to turn texts, like 'deputy fire chief', into feature vectors, is to count the number of occurrences of each word in the text, ignoring order. This is done using the bag of words model.

Next time¶

  • Implementing the bag of words model.
  • TF-IDF: an improvement on bag of words that considers the importance of each word.