import pandas as pd
import numpy as np
import os

import requests
import bs4
import json
from IPython.display import HTML, Image


fac_response = requests.get('https://datascience.ucsd.edu/about/faculty/faculty/')
fac_response

<Response [200]>


soup = bs4.BeautifulSoup(fac_response.text)


divs = soup.find_all('div', attrs={'data-entry-type': 'individual'})


divs[0]

<div class="cn-list-row cn-list-item vcard individual faculty lecturers" data-entry-id="229" data-entry-slug="rod-albuyeh" data-entry-type="individual" id="rod-albuyeh">
<div class="cn-entry cn-accordion" id="entry-id-2296272d556adb37">
<div class="cn-left" style="min-width: 215px;">
<span class="cn-image-style"><span style="display: block; max-width: 100%; width: 215px"><img alt="Photo of Rod Albuyeh" class="cn-image photo" height="215" lazyload="1" loading="lazy" sizes="100vw" srcset="//datascience.ucsd.edu/wp-content/uploads/connections-images/rod-albuyeh/Rod-Albuyeh-Web-07dd8c651b197a11107f1c858ce1e390.jpg 1x" title="Photo of Rod Albuyeh" width="215"/></span></span>
</div> <!-- end cn-left-->
<div class="cn-right">
<h3 style="border-bottom: #182A48 1px solid; color:#182A48;"><a href="https://datascience.ucsd.edu/about/faculty/faculty/name/rod-albuyeh/" title="Rod Albuyeh"><span class="fn n notranslate"><span class="given-name">Rod</span> <span class="family-name">Albuyeh</span></span></a>
</h3><h4 class="title">Lecturer</h4>
<div class="cn-excerpt" id="cn-excerpt-2296272d556adb37">

			Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.  He received his Ph.D. in Political Science at USC in 2016.  His specialties lie in anomaly detection for tabular time-series data and machine learning systems--with applications in marketing, fraud, and credit risk.  He is also interested in applying enterprise machine learning approaches to solve problems in the social sciences.

Research Interests: Machine Learning, Deployment, Scalable Systems<span class="cn-link-more"><a href="https://datascience.ucsd.edu/about/faculty/faculty/name/rod-albuyeh/">Learn More</a></span>
</div> <!--end cn-excerpt-->
<div class="cn-detail cn-hide" id="cn-detail-2296272d556adb37">
<div class="cn-bio" id="cn-bio-2296272d556adb37"><div class="cn-biography"><p>Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.  He received his Ph.D. in Political Science at USC in 2016.  His specialties lie in anomaly detection for tabular time-series data and machine learning systems–with applications in marketing, fraud, and credit risk.  He is also interested in applying enterprise machine learning approaches to solve problems in the social sciences.</p>
<p>Research Interests: Machine Learning, Deployment, Scalable Systems</p>
</div>
</div>
</div> <!--end cn-detail-->
</div> <!--end cn-right-->
</div> <!-- end cn-entry -->
</div>


divs[0].find('a')

<a href="https://datascience.ucsd.edu/about/faculty/faculty/name/rod-albuyeh/" title="Rod Albuyeh"><span class="fn n notranslate"><span class="given-name">Rod</span> <span class="family-name">Albuyeh</span></span></a>


divs[0].find('a').get('title')

'Rod Albuyeh'


divs[0].find('h4')

<h4 class="title">Lecturer</h4>


divs[0].find('h4').text

'Lecturer'


divs[0].find('div', attrs={'class': 'cn-bio'})

<div class="cn-bio" id="cn-bio-2296272d556adb37"><div class="cn-biography"><p>Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.  He received his Ph.D. in Political Science at USC in 2016.  His specialties lie in anomaly detection for tabular time-series data and machine learning systems–with applications in marketing, fraud, and credit risk.  He is also interested in applying enterprise machine learning approaches to solve problems in the social sciences.</p>
<p>Research Interests: Machine Learning, Deployment, Scalable Systems</p>
</div>
</div>


divs[0].find('div', attrs={'class': 'cn-bio'}).text.strip()

'Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.\xa0 He received his Ph.D. in Political Science at USC in 2016.\xa0 His specialties lie in anomaly detection for tabular time-series data and machine learning systems–with applications in marketing, fraud, and credit risk.\xa0 He is also interested in applying enterprise machine\xa0learning approaches to solve problems in the social sciences.\nResearch Interests: Machine Learning, Deployment, Scalable Systems'


names = [div.find('a').get('title') for div in divs]
names[:5]

['Rod Albuyeh',
 'Ilkay Altintas',
 'Mikio Aoi',
 'Ery Arias-Castro',
 'Vineet Bafna']


titles = [div.find('h4').text if div.find('h4') else '' for div in divs]


bios = [div.find('div', attrs={'class': 'cn-bio'}).text.strip() for div in divs]


faculty = pd.DataFrame().assign(name=names, title=titles, bio=bios)
faculty.head()


faculty[faculty['title'] == 'Lecturer']


divs[0].find('img')

<img alt="Photo of Rod Albuyeh" class="cn-image photo" height="215" lazyload="1" loading="lazy" sizes="100vw" srcset="//datascience.ucsd.edu/wp-content/uploads/connections-images/rod-albuyeh/Rod-Albuyeh-Web-07dd8c651b197a11107f1c858ce1e390.jpg 1x" title="Photo of Rod Albuyeh" width="215"/>


divs[0].find('img').get('srcset')

'//datascience.ucsd.edu/wp-content/uploads/connections-images/rod-albuyeh/Rod-Albuyeh-Web-07dd8c651b197a11107f1c858ce1e390.jpg 1x'


def show_picture(name):
    idx = names.index(name)
    url = divs[idx].find('img').get('srcset')
    url = 'https://' + url.strip('/').strip(' 1x')
    display(Image(url))


show_picture('Suraj Rampure')


f'2 + 3 = {2 + 3}'

'2 + 3 = 5'


def make_greeting(name):
    return f"Hi {name}! 👋 Your name has {len(name)} characters, the first of which is {name[0]}."


make_greeting('Billy')

'Hi Billy! 👋 Your name has 5 characters, the first of which is B.'


def download_page(i):
    url = f'https://quotes.toscrape.com/page/{i}'
    request = requests.get(url)
    return bs4.BeautifulSoup(request.text)


soup = download_page(5)


divs = soup.find_all('div', attrs={'class': 'quote'})


divs[0]

<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“A reader lives a thousand lives before he dies, said Jojen. The man who never reads lives only one.”</span>
<span>by <small class="author" itemprop="author">George R.R. Martin</small>
<a href="/author/George-R-R-Martin">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="read,readers,reading,reading-books" itemprop="keywords"/>
<a class="tag" href="/tag/read/page/1/">read</a>
<a class="tag" href="/tag/readers/page/1/">readers</a>
<a class="tag" href="/tag/reading/page/1/">reading</a>
<a class="tag" href="/tag/reading-books/page/1/">reading-books</a>
</div>
</div>


divs[0].find('span', attrs={'class': 'text'}).text

'“A reader lives a thousand lives before he dies, said Jojen. The man who never reads lives only one.”'


divs[0].find('small', attrs={'class': 'author'}).text

'George R.R. Martin'


divs[0].find('a').get('href')

'/author/George-R-R-Martin'


divs[0].find('meta', attrs={'class': 'keywords'}).get('content')

'read,readers,reading,reading-books'


def process_quote(div):
    quote = div.find('span', attrs={'class': 'text'}).text
    author = div.find('small', attrs={'class': 'author'}).text
    author_url = 'https://quotes.toscrape.com' + div.find('a').get('href')
    tags = div.find('meta', attrs={'class': 'keywords'}).get('content')
    
    return pd.Series({'quote': quote, 'author': author, 'author_url': author_url, 'tags': tags})


process_quote(divs[3])

quote         “If you can make a woman laugh, you can make h...
author                                           Marilyn Monroe
author_url    https://quotes.toscrape.com/author/Marilyn-Monroe
tags                                                 girls,love
dtype: object


def process_page(divs):
    return pd.DataFrame([process_quote(div) for div in divs])


process_page(divs)


def quote_df(n):
    '''Returns a DataFrame containing the quotes on the first n pages of https://quotes.toscrape.com/.'''
    dfs = []
    for i in range(1, n + 1):
        # Download page n and create a BeautifulSoup object
        soup = download_page(i)
        
        # Create DataFrame using the information in that page
        divs = soup.find_all('div', attrs={'class': 'quote'})
        df = process_page(divs)
        
        # Append DataFrame to dfs
        dfs.append(df)
        
    # Stitch all DataFrames together
    return pd.concat(dfs).reset_index(drop=True)


first_three_pages = quote_df(3)
first_three_pages.head()


np.unique(first_three_pages['author_url'])

array(['https://quotes.toscrape.com/author/Albert-Einstein',
       'https://quotes.toscrape.com/author/Allen-Saunders',
       'https://quotes.toscrape.com/author/Andre-Gide',
       'https://quotes.toscrape.com/author/Bob-Marley',
       'https://quotes.toscrape.com/author/Douglas-Adams',
       'https://quotes.toscrape.com/author/Dr-Seuss',
       'https://quotes.toscrape.com/author/Eleanor-Roosevelt',
       'https://quotes.toscrape.com/author/Elie-Wiesel',
       'https://quotes.toscrape.com/author/Friedrich-Nietzsche',
       'https://quotes.toscrape.com/author/Garrison-Keillor',
       'https://quotes.toscrape.com/author/J-K-Rowling',
       'https://quotes.toscrape.com/author/Jane-Austen',
       'https://quotes.toscrape.com/author/Jim-Henson',
       'https://quotes.toscrape.com/author/Marilyn-Monroe',
       'https://quotes.toscrape.com/author/Mark-Twain',
       'https://quotes.toscrape.com/author/Mother-Teresa',
       'https://quotes.toscrape.com/author/Pablo-Neruda',
       'https://quotes.toscrape.com/author/Ralph-Waldo-Emerson',
       'https://quotes.toscrape.com/author/Steve-Martin',
       'https://quotes.toscrape.com/author/Thomas-A-Edison'], dtype=object)


einstein = bs4.BeautifulSoup(requests.get('https://quotes.toscrape.com/author/Albert-Einstein').text)


einstein.find('div', attrs={'class': 'author-description'}).text[:1000]

'\n        In 1879, Albert Einstein was born in Ulm, Germany. He completed his Ph.D. at the University of Zurich by 1909. His 1905 paper explaining the photoelectric effect, the basis of electronics, earned him the Nobel Prize in 1921. His first paper on Special Relativity Theory, also published in 1905, changed the world. After the rise of the Nazi party, Einstein made Princeton his permanent home, becoming a U.S. citizen in 1940. Einstein, a pacifist during World War I, stayed a firm proponent of social justice and responsibility. He chaired the Emergency Committee of Atomic Scientists, which organized to alert the public to the dangers of atomic warfare.At a symposium, he advised: "In their struggle for the ethical good, teachers of religion must have the stature to give up the doctrine of a personal God, that is, give up that source of fear and hope which in the past placed such vast power in the hands of priests. In their labors they will have to avail themselves of those forces whi'


f = open(os.path.join('data', 'quotes2scrape.json'))


json.loads(f.readline())

{'auth_url': 'http://quotes.toscrape.com/author/Albert-Einstein',
 'quote_auth': 'Albert Einstein',
 'quote_text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 'bio': 'In 1879, Albert Einstein was born in Ulm, Germany. He completed his Ph.D. at the University of Zurich by 1909. His 1905 paper explaining the photoelectric effect, the basis of electronics, earned him the Nobel Prize in 1921. His first paper on Special Relativity Theory, also published in 1905, changed the world. After the rise of the Nazi party, Einstein made Princeton his permanent home, becoming a U.S. citizen in 1940. Einstein, a pacifist during World War I, stayed a firm proponent of social justice and responsibility. He chaired the Emergency Committee of Atomic Scientists, which organized to alert the public to the dangers of atomic warfare.At a symposium, he advised: "In their struggle for the ethical good, teachers of religion must have the stature to give up the doctrine of a personal God, that is, give up that source of fear and hope which in the past placed such vast power in the hands of priests. In their labors they will have to avail themselves of those forces which are capable of cultivating the Good, the True, and the Beautiful in humanity itself. This is, to be sure a more difficult but an incomparably more worthy task . . . " ("Science, Philosophy and Religion, A Symposium," published by the Conference on Science, Philosophy and Religion in their Relation to the Democratic Way of Life, Inc., New York, 1941). In a letter to philosopher Eric Gutkind, dated Jan. 3, 1954, Einstein stated: "The word god is for me nothing more than the expression and product of human weaknesses, the Bible a collection of honorable, but still primitive legends which are nevertheless pretty childish. No interpretation no matter how subtle can (for me) change this," (The Guardian, "Childish superstition: Einstein\'s letter makes view of religion relatively clear," by James Randerson, May 13, 2008). D. 1955.While best known for his mass–energy equivalence formula E = mc2 (which has been dubbed "the world\'s most famous equation"), he received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". The latter was pivotal in establishing quantum theory.Einstein thought that Newtonion mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field. This led to the development of his special theory of relativity. He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on the general theory of relativity. He continued to deal with problems of statistical mechanics and quantum theory, which led to his explanations of particle theory and the motion of molecules. He also investigated the thermal properties of light which laid the foundation of the photon theory of light.He was visiting the United States when Adolf Hitler came to power in 1933 and did not go back to Germany. On the eve of World War II, he endorsed a letter to President Franklin D. Roosevelt alerting him to the potential development of "extremely powerful bombs of a new type" and recommending that the U.S. begin similar research. This eventually led to what would become the Manhattan Project. Einstein supported defending the Allied forces, but largely denounced the idea of using the newly discovered nuclear fission as a weapon. Later, with Bertrand Russell, Einstein signed the Russell–Einstein Manifesto, which highlighted the danger of nuclear weapons. Einstein was affiliated with the Institute for Advanced Study in Princeton, New Jersey, until his death in 1955.His great intellectual achievements and originality have made the word "Einstein" synonymous with genius.More: http://en.wikipedia.org/wiki/Albert_E...http://www.nobelprize.org/nobel_prize...',
 'dob': 'March 14, 1879',
 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}


L = [json.loads(x) for x in open(os.path.join('data', 'quotes2scrape.json'))]


df = pd.DataFrame(L)
df.head()


df['tags'].iloc[0]

['change', 'deep-thoughts', 'thinking', 'world']


df.to_csv('out.csv')


df_again = pd.read_csv('out.csv')
df_again.head()


df_again['tags'].iloc[0]

"['change', 'deep-thoughts', 'thinking', 'world']"


distinct_tags = np.unique(df['tags'].sum())
distinct_tags

array(['abilities', 'activism', 'adulthood', 'adventure', 'age',
       'alcohol', 'aliteracy', 'apathy', 'attributed',
       'attributed-no-source', 'authors', 'be-yourself', 'beatles',
       'better-life-empathy', 'bilbo', 'books', 'change', 'children',
       'chocolate', 'choices', 'christianity', 'classic', 'comedy',
       'connection', 'contentment', 'courage', 'death', 'deep-thoughts',
       'difficult', 'dreamers', 'dreaming', 'dreams', 'drug',
       'dumbledore', 'edison', 'education', 'elizabeth-bennet', 'failure',
       'fairy-tales', 'fairytales', 'faith', 'fantasy', 'fate', 'fear',
       'food', 'friends', 'friendship', 'girls', 'god', 'good',
       'growing-up', 'grown-ups', 'happiness', 'hate', 'heartbreak',
       'hope', 'humor', 'imagination', 'indifference', 'insanity',
       'inspiration', 'inspirational', 'integrity', 'jane-austen',
       'journey', 'knowledge', 'lack-of-friendship', 'lack-of-love',
       'learning', 'library', 'lies', 'life', 'literature', 'live',
       'live-death-love', 'lost', 'love', 'lying', 'marriage', 'mind',
       'miracle', 'miracles', 'misattributed-eleanor-roosevelt',
       'misattributed-john-lennon', 'misattributed-mark-twain',
       'misattributed-to-c-s-lewis', 'misattributed-to-einstein',
       'misattributed-to-mother-teresa', 'mistakes', 'music',
       'navigation', 'novelist-quotes', 'obvious', 'open-mind',
       'opposite', 'paraphrased', 'peace', 'philosophy', 'planning',
       'plans', 'poetry', 'quest', 'read', 'readers', 'reading',
       'reading-books', 'regrets', 'religion', 'romance', 'romantic',
       'self-indulgence', 'seuss', 'simile', 'simplicity', 'sinister',
       'sisters', 'success', 'sun', 'tea', 'the-hunger-games', 'thinking',
       'thought', 'travel', 'troubles', 'truth', 'understand',
       'understanding', 'unhappy-marriage', 'value', 'wander', 'wisdom',
       'women', 'world', 'write', 'writers', 'writing', 'yourself'],
      dtype='<U31')


def flatten_tags(taglist):
    return pd.Series({k:1 for k in taglist}, dtype=float)

tags = df['tags'].apply(flatten_tags).fillna(0).astype(int)
tags.head()


df_full = pd.concat([df, tags], axis=1).drop(columns='tags')
df_full.head()


df_full[df_full['inspirational'] == 1].head()


codes = pd.read_csv(os.path.join('data', 'codes.csv'))
programs = pd.read_csv(os.path.join('data', 'programs.csv'))

display(codes)
display(programs)


codes.merge(programs, on='department')


display(codes)
display(programs)


def canonicalize_department(d):
    return (d
           .lower()
           .replace('sci.', 'science')
           .replace('stud.', 'studies')
           .replace('eng.', 'engineering')
           .replace('&', 'and')
           .replace('(', '- ')
           .replace(')', '')
           )


codes['department_clean'] = codes['department'].apply(canonicalize_department)
programs['department_clean'] = programs['department'].apply(canonicalize_department)

display(codes)
display(programs)


codes.merge(programs, on='department_clean')


s = '''132.249.20.188 - - [05/May/2022:14:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''


full_date = s.split('[')[1].split(']')[0]
full_date

'05/May/2022:14:26:15 -0800'


day, month, rest = full_date.split('/')
day, month, rest

('05', 'May', '2022:14:26:15 -0800')


year, hour, minute, second = rest.split(':')
second = second[:2]
year, hour, minute, second

('2022', '14', '26', '15')


year, day, month, hour, minute, second

('2022', '05', 'May', '14', '26', '15')


pd.to_datetime(full_date[:-6], format='%d/%b/%Y:%H:%M:%S')

Timestamp('2022-05-05 14:26:15')

s

'132.249.20.188 - - [05/May/2022:14:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'


import re
re.findall('\[(\d)+\/(\w+)\/(\d+):(\d+):(\d+):(\d+)\s.*\]', s)[0]

('5', 'May', '2022', '14', '26', '15')

	quote	author	author_url	tags
0	“The world as we have created it is a process ...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	change,deep-thoughts,thinking,world
1	“It is our choices, Harry, that show what we t...	J.K. Rowling	https://quotes.toscrape.com/author/J-K-Rowling	abilities,choices
2	“There are only two ways to live your life. On...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	inspirational,life,live,miracle,miracles
3	“The person, be it gentleman or lady, who has ...	Jane Austen	https://quotes.toscrape.com/author/Jane-Austen	aliteracy,books,classic,humor
4	“Imperfection is beauty, madness is genius and...	Marilyn Monroe	https://quotes.toscrape.com/author/Marilyn-Monroe	be-yourself,inspirational

	auth_url	quote_auth	quote_text	bio	dob	tags
0	http://quotes.toscrape.com/author/Albert-Einstein	Albert Einstein	“The world as we have created it is a process ...	In 1879, Albert Einstein was born in Ulm, Germ...	March 14, 1879	[change, deep-thoughts, thinking, world]
1	http://quotes.toscrape.com/author/J-K-Rowling	J.K. Rowling	“It is our choices, Harry, that show what we t...	See also: Robert GalbraithAlthough she writes ...	July 31, 1965	[abilities, choices]
2	http://quotes.toscrape.com/author/Albert-Einstein	Albert Einstein	“There are only two ways to live your life. On...	In 1879, Albert Einstein was born in Ulm, Germ...	March 14, 1879	[inspirational, life, live, miracle, miracles]
3	http://quotes.toscrape.com/author/Jane-Austen	Jane Austen	“The person, be it gentleman or lady, who has ...	Jane Austen was an English novelist whose work...	December 16, 1775	[aliteracy, books, classic, humor]
4	http://quotes.toscrape.com/author/Marilyn-Monroe	Marilyn Monroe	“Imperfection is beauty, madness is genius and...	Marilyn Monroe (born Norma Jeane Mortenson; Ju...	June 01, 1926	[be-yourself, inspirational]

	Unnamed: 0	auth_url	quote_auth	quote_text	bio	dob	tags
0	0	http://quotes.toscrape.com/author/Albert-Einstein	Albert Einstein	“The world as we have created it is a process ...	In 1879, Albert Einstein was born in Ulm, Germ...	March 14, 1879	['change', 'deep-thoughts', 'thinking', 'world']
1	1	http://quotes.toscrape.com/author/J-K-Rowling	J.K. Rowling	“It is our choices, Harry, that show what we t...	See also: Robert GalbraithAlthough she writes ...	July 31, 1965	['abilities', 'choices']
2	2	http://quotes.toscrape.com/author/Albert-Einstein	Albert Einstein	“There are only two ways to live your life. On...	In 1879, Albert Einstein was born in Ulm, Germ...	March 14, 1879	['inspirational', 'life', 'live', 'miracle', '...
3	3	http://quotes.toscrape.com/author/Jane-Austen	Jane Austen	“The person, be it gentleman or lady, who has ...	Jane Austen was an English novelist whose work...	December 16, 1775	['aliteracy', 'books', 'classic', 'humor']
4	4	http://quotes.toscrape.com/author/Marilyn-Monroe	Marilyn Monroe	“Imperfection is beauty, madness is genius and...	Marilyn Monroe (born Norma Jeane Mortenson; Ju...	June 01, 1926	['be-yourself', 'inspirational']

	department	code
0	Data Sci.	DSC
1	Computer Sci. and Eng.	CSE
2	Biology (Lower Division)	BILD
3	German Stud.	LTGM

	department	programs
0	data science	3
1	computer science & engineering	5
2	biology - lower division	4
3	german studies	2

Lecture 16 – Parsing, Regular Expressions¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Example: Scraping the HDSI Faculty page¶

HDSI Faculty page¶

Example: Scraping quotes¶

Example: Scraping quotes¶

The plan¶

Aside: f-strings in Python¶

Downloading a single page¶

Parsing a single page¶

Putting it all together¶

An extension¶

Key takeaways¶

Nested vs. flat data formats¶

Nested vs. flat data formats¶

Example: Scraping quotes, again¶

One-hot encoding¶

String methods, revisited¶

Transitioning¶

Joining on text¶

String canonicalization¶

Reflection¶

The limitations of string methods¶

Parsing log strings¶

Regular expressions¶

This works...?¶

🤔🤯

Regular expressions¶

regex101.com ¶

Regex building blocks 🧱¶

Example 1¶

Example 2¶

Summary, next time¶

Summary¶

	name	title	bio
0	Rod Albuyeh	Lecturer	Albuyeh is Principal Data Scientist at Figure ...
1	Ilkay Altintas	Chief Data Science Officer, HDSI Founding Facu...	CHIEF DATA SCIENCE OFFICER, SDSC\nIlkay Altint...
2	Mikio Aoi	Assistant Professor	Dr. Aoi is a computational neuroscientist inte...
3	Ery Arias-Castro	Professor	Ery Arias-Castro received his Ph.D. in Statist...
4	Vineet Bafna	Professor	Vineet Bafna, Ph.D., is a Bioinformatics resea...

	quote	author	author_url	tags
0	“A reader lives a thousand lives before he die...	George R.R. Martin	https://quotes.toscrape.com/author/George-R-R-...	read,readers,reading,reading-books
1	“You can never get a cup of tea large enough o...	C.S. Lewis	https://quotes.toscrape.com/author/C-S-Lewis	books,inspirational,reading,tea
2	“You believe lies so you eventually learn to t...	Marilyn Monroe	https://quotes.toscrape.com/author/Marilyn-Monroe
3	“If you can make a woman laugh, you can make h...	Marilyn Monroe	https://quotes.toscrape.com/author/Marilyn-Monroe	girls,love
4	“Life is like riding a bicycle. To keep your b...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	life,simile
5	“The real lover is the man who can thrill you ...	Marilyn Monroe	https://quotes.toscrape.com/author/Marilyn-Monroe	love
6	“A wise girl kisses but doesn't love, listens ...	Marilyn Monroe	https://quotes.toscrape.com/author/Marilyn-Monroe	attributed-no-source
7	“Only in the darkness can you see the stars.”	Martin Luther King Jr.	https://quotes.toscrape.com/author/Martin-Luth...	hope,inspirational
8	“It matters not what someone is born, but what...	J.K. Rowling	https://quotes.toscrape.com/author/J-K-Rowling	dumbledore
9	“Love does not begin and end the way we seem t...	James Baldwin	https://quotes.toscrape.com/author/James-Baldwin	love

	change	deep-thoughts	thinking	world	abilities	choices	inspirational	life	live	miracle	...
0	1	1	1	1	0	0	0	0	0	0	...
1	0	0	0	0	1	1	0	0	0	0	...
2	0	0	0	0	0	0	1	1	1	1	...
3	0	0	0	0	0	0	0	0	0	0	...
4	0	0	0	0	0	0	1	0	0	0	...

operation	order of op.	example	matches ✅	does not match ❌
concatenation	3	`AABAAB`	AABAAB	every other string
or	4	`AA\|BAAB`	AA, BAAB	every other string
closure (zero or more)	2	`AB*A`	AA, ABBBBBBA	AB, ABABA
parentheses	1	`A(A\|B)AAB` `(AB)*A`	AAAAB, ABAAB A, ABABABABA	every other string AA, ABBA

	change	deep-thoughts	thinking	world	abilities	choices	inspirational	life	live	miracle	...
0	1	1	1	1	0	0	0	0	0	0	...
1	0	0	0	0	1	1	0	0	0	0	...
2	0	0	0	0	0	0	1	1	1	1	...
3	0	0	0	0	0	0	0	0	0	0	...
4	0	0	0	0	0	0	1	0	0	0	...

Lecture 16 – Parsing, Regular Expressions¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Example: Scraping the HDSI Faculty page¶

HDSI Faculty page¶

Example: Scraping quotes¶

Example: Scraping quotes¶

The plan¶

Aside: f-strings in Python¶

Downloading a single page¶

Parsing a single page¶

Putting it all together¶

An extension¶

Key takeaways¶

Nested vs. flat data formats¶

Nested vs. flat data formats¶

Example: Scraping quotes, again¶

One-hot encoding¶

String methods, revisited¶

Transitioning¶

Joining on text¶

String canonicalization¶

Reflection¶

The limitations of string methods¶

Parsing log strings¶

Regular expressions¶

This works...?¶

🤔🤯

Regular expressions¶

regex101.com¶

Regex building blocks 🧱¶

Example 1¶

Example 2¶

Summary, next time¶

Summary¶

regex101.com ¶

	change	deep-thoughts	thinking	world	abilities	choices	inspirational	life	live	miracle	...
0	1	1	1	1	0	0	0	0	0	0	...
1	0	0	0	0	1	1	0	0	0	0	...
2	0	0	0	0	0	0	1	1	1	1	...
3	0	0	0	0	0	0	0	0	0	0	...
4	0	0	0	0	0	0	1	0	0	0	...