from dsc80_utils import *

<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th> 
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td> 
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td> 
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td> 
            <td>N95</td>
        </tr>
    </table>
</body>

html_as_string = '''
<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th> 
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td> 
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td> 
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td> 
            <td>N95</td>
        </tr>
    </table>
</body>'''

from bs4 import BeautifulSoup

# Name the parser explicitly. Without it, BeautifulSoup picks whichever parser
# happens to be installed (and emits a GuessedAtParserWarning), so the same
# markup can produce a different tree on a different machine.
soup = BeautifulSoup(html_as_string, 'html.parser')

soup

<head>
<title>3*Canada-2022-06-04</title>
</head>
<body>
<h1>Spotify Top 3 - Canada</h1>
<table>
<tr class="heading">
<th>Rank</th>
<th>Artist(s)</th>
<th>Song</th>
</tr>
<tr class="1">
<td>1</td>
<td>Harry Styles</td>
<td>As It Was</td>
</tr>
<tr class="2">
<td>2</td>
<td>Jack Harlow</td>
<td>First Class</td>
</tr>
<tr class="3">
<td>3</td>
<td>Kendrick Lamar</td>
<td>N95</td>
</tr>
</table>
</body>

def top_nth(n):
    '''Return the text of the last cell in the `soup` row whose class is `n`.

    In the chart table held by `soup`, that last cell is the song title.
    '''
    ranked_row = soup.find("tr", attrs={'class': n})
    cells = ranked_row.find_all("td")
    # The song title sits in the final <td> of each row.
    return cells[-1].text

top_nth(3)

'N95'

contact = '''
Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.
'''

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

def is_possibly_area_code(s):
    '''Does `s` look like (678)?

    Returns True only when `s` is exactly five characters long: an opening
    parenthesis, three decimal digits, and a closing parenthesis.
    '''
    # `isdecimal` rather than `isnumeric`: the latter also accepts characters
    # such as superscripts and vulgar fractions, which cannot appear in a phone
    # number. `isdecimal` accepts exactly the characters that regex \d matches.
    return (len(s) == 5 and
            s.startswith('(') and
            s.endswith(')') and
            s[1:4].isdecimal())

is_possibly_area_code('(123)')

True

is_possibly_area_code('(99)')

False

def is_last_7_phone_number(s):
    '''Does `s` look like 999-8212?

    Returns True only when `s` is exactly eight characters long: three decimal
    digits, a hyphen, and four more decimal digits.
    '''
    # `isdecimal` rather than `isnumeric`: the latter also accepts characters
    # such as superscripts and vulgar fractions, which cannot appear in a phone
    # number. `isdecimal` accepts exactly the characters that regex \d matches.
    return (len(s) == 8 and
            s[:3].isdecimal() and
            s[3] == '-' and
            s[4:].isdecimal())

is_last_7_phone_number('999-8212')

True

is_last_7_phone_number('534 1100')

False

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

# Split the message on whitespace and strip trailing punctuation from each token.
pieces = [s.rstrip('.,?;"\'') for s in contact.split()]

# Look at every pair of neighbouring tokens; a phone number is an area code
# immediately followed by the last seven digits.
for area_code, last_seven in zip(pieces, pieces[1:]):
    if is_possibly_area_code(area_code) and is_last_7_phone_number(last_seven):
        print(area_code, last_seven)

(800) 867-5309
(800) 123-4567

print(contact)

Thank you for buying our expensive product!

If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.

If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!

Due to high demand, please allow one-hundred (100) business days for a response.

import re
# The escaped parentheses match a literal "(ddd)" area code; it must be followed
# by a single space and then "ddd-dddd". One pattern replaces both helper functions.
re.findall(r'\(\d{3}\) \d{3}-\d{4}', contact)

['(800) 867-5309', '(800) 123-4567']

import re

re.search('AB*A', 
          'here is a string for you: ABBBA. here is another: ABBBBBBBA')

<re.Match object; span=(26, 31), match='ABBBA'>

re.findall('AB*A', 
           'here is a string for you: ABBBA. here is another: ABBBBBBBA')

['ABBBA', 'ABBBBBBBA']

re.sub

<function re.sub(pattern, repl, string, count=0, flags=0)>

re.sub('AB*A', 
       'billy', 
       'here is a string for you: ABBBA. here is another: ABBBBBBBA')

'here is a string for you: billy. here is another: billy'

re.findall('\bcat\b', 'my cat is hungry')

[]

re.findall(r'\bcat\b', 'my cat is hungry')

['cat']

# Huh? In a regular (non-raw) string literal, Python itself turns '\b' into the
# backspace character before `re` ever sees it, so the pattern contains no
# word-boundary token. Printing the string shows only "cat".
print('\bcat\b')

cat

print(r'\bcat\b')

\bcat\b

# When the pattern contains one capture group, findall returns only the text
# matched inside the parentheses (here, the domain name), not the whole match.
re.findall(r'\w+@(\w+)\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')

['notucsd', 'ucsd']

re.findall(r'\w+@\w+\.edu', 
           'my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu')

['billy@notucsd.edu', 'notbilly@ucsd.edu']

# A regex that matches strings with two of the same vowel followed by 3 digits.
# We only want to capture the digits, but the parentheses that group the
# alternation are a capture group too, so findall returns a tuple with both groups.
re.findall(r'(aa|ee|ii|oo|uu)(\d{3})', 'eeoo124')[0]

('oo', '124')

s = '''132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''

# Six capture groups pull day, month, year, hour, minute and second out of the
# bracketed timestamp. Each `.+` accepts any run of characters, so this pattern
# also matches bracketed text that is not a timestamp at all.
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]

other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(exp, other_s)

[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

s

'132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'

# Stricter timestamp pattern: two-digit day, capitalised three-letter month,
# four-digit year, two-digit hour/minute/second, and a "-dddd" UTC offset.
# Written as a raw string: in a regular string literal, sequences such as `\[`
# and `\d` are invalid escapes that current Python versions warn about.
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]

other_s

'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'

re.findall(new_exp, other_s)

[]

operation	order of op.	example	matches ✅	does not match ❌
concatenation	3	`AABAAB`	`'AABAAB'`	every other string
or	4	`AA\|BAAB`	`'AA'`, `'BAAB'`	every other string
closure (zero or more)	2	`AB*A`	`'AA'`, `'ABBBBBBA'`	`'AB'`, `'ABABA'`
parentheses	1	`A(A\|B)AAB` `(AB)*A`	`'AAAAB'`, `'ABAAB'` `'A'`, `'ABABABABA'`	every other string `'AA'`, `'ABBA'`

operation	example	matches ✅	does not match ❌
wildcard	`.U.U.U.`	`'CUMULUS'` `'JUGULUM'`	`'SUCCUBUS'` `'TUMULTUOUS'`
character class	`[A-Za-z][a-z]*`	`'word'` `'Capitalized'`	`'camelCase'` `'4illegal'`
at least one	`bi(ll)+y`	`'billy'` `'billlllly'`	`'biy'` `'bily'`
between $i$ and $j$ occurrences	`m[aeiou]{1,2}m`	`'mem'` `'maam'` `'miem'`	`'mm'` `'mooom'` `'meme'`

operation	example	matches ✅	does not match ❌
escape character	`ucsd\.edu`	`'ucsd.edu'`	`'ucsd!edu'`
beginning of line	`^ark`	`'ark two'` `'ark o ark'`	`'dark'`
end of line	`ark$`	`'dark'` `'ark o ark'`	`'ark two'`
zero or one	`cat?`	`'ca'` `'cat'`	`'cart'` (matches `'ca'` only)
built-in character classes*	`\w+` `\d+`	`'billy'` `'231231'`	`'this person'` `'858 people'`
character class negation	`[^a-z]+`	`'KINGTRITON551'` `'1721$$'`	`'porch'` `'billy.edu'`

Lecture 11 – Regular Expressions and Text Features¶

DSC 80, Winter 2024¶

Announcements 📣¶

Agenda 📆¶

Exercise

Motivation¶

Who called? 📞¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Exercise

Exercise

Intermediate regex¶

More regex syntax¶

Exercise

Exercise

Even more regex syntax¶

Exercise

Regex in Python¶

`re` in Python¶

Raw strings¶

Capture groups¶

Example: Log parsing¶

The more specific, the better!¶

Exercise

Limitations of regular expressions¶

Text features¶

Summary, next time¶

Summary¶

Next time¶

Lecture 11 – Regular Expressions and Text Features¶

DSC 80, Winter 2024¶

Announcements 📣¶

Agenda 📆¶

Exercise

Motivation¶

Who called? 📞¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Exercise

Exercise

Intermediate regex¶

More regex syntax¶

Exercise

Exercise

Even more regex syntax¶

Exercise

Regex in Python¶

`re` in Python¶

Raw strings¶

Capture groups¶

Example: Log parsing¶

The more specific, the better!¶

Exercise

Limitations of regular expressions¶

Text features¶

Summary, next time¶

Summary¶

Next time¶

`re` in Python¶