from dsc80_utils import *

import requests
r = requests.get('https://datascience.ucsd.edu/faculty/')
r

<Response [200]>

faculty_text = r.text
len(faculty_text)

270510

print(faculty_text[:1000])

<!DOCTYPE html>
<html lang="en-US">
<head>
    <meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<link rel="profile" href="https://gmpg.org/xfn/11" />
    <title>Faculty &#8211; Halıcıoğlu Data Science Institute &#8211; UC San Diego</title>
					<style type="text/css" id="cst_font_data">
						@font-face {
	font-family: 'Refrigerator Deluxe Extrabold';
	font-weight: 100;
	font-display: auto;
	font-fallback: Arial, Serif;
	src: url('https://datascience.ucsd.edu/wp-content/uploads/2023/03/Refrigerator-Deluxe-Extrabold.otf') format('OpenType');
}
@font-face {
	font-family: 'Refrigerator Deluxe Extrabold';
	font-weight: 100;
	font-display: auto;
	font-fallback: Arial, Serif;
	src: url('https://datascience.ucsd.edu/wp-content/uploads/2023/03/Refrigerator-Deluxe-Extrabold.otf') format('OpenType');
}
@font-face {
	font-family: 'Brix Sans Regular';
	font-weight: 400;
	font-display: auto;
	fon

'Samuel Lau' in faculty_text

False

!cat data/lec10_ex1.html

<html>
  <head>
    <title>Page title</title>
  </head>

  <body>
    <h1>This is a heading</h1>
    <p>This is a paragraph.</p>
    <p>This is <b>another</b> paragraph.</p>
  </body>
</html>

from IPython.display import HTML
from pathlib import Path
HTML(filename=Path('data') / 'lec10_ex1.html')

<img src="king-selfie.png" alt="A photograph of King Triton." width=500>

!cat data/lec10_ex2.html

<html>
  <head>
    <title>Project 3 - DSC 80, Winter 2023</title>
    <link
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css"
      rel="stylesheet"
    />
  </head>

  <body>
    <h1>Project Overview</h1>
    <img src="../imgs/platter.png" width="200" alt="My dinner last night." />
    <p>
      Start Project 3 by cloning our
      <a href="https://github.com/dsc-courses/dsc80-2023-fa/"
        >public GitHub repo</a
      >. Note that there is <b>no checkpoint</b> for Project 3!
    </p>

    <center>
      <h3>
        Note that you'll have to submit your notebook as a PDF and a link to
        your website.
      </h3>
    </center>
  </body>
</html>

<div style="background-color:lightblue">
  <h3>This is a heading</h3>
  <p>This is a paragraph.</p>
</div>

html_string = '''
<html>
    <body>
      <div id="content">
        <h1>Heading here</h1>
        <p>My First paragraph</p>
        <p>My <em>second</em> paragraph</p>
        <hr>
      </div>
      <div id="nav">
        <ul>
          <li>item 1</li>
          <li>item 2</li>
          <li>item 3</li>
        </ul>
      </div>
    </body>
</html>
'''.strip()

HTML(html_string)

import bs4

bs4.BeautifulSoup?

soup = bs4.BeautifulSoup(html_string)
soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>

type(soup)

bs4.BeautifulSoup

print(soup.text)



Heading here
My First paragraph
My second paragraph


item 1
item 2
item 3

soup.descendants

<generator object Tag.descendants at 0x7fd6c113aba0>

# Visit every node yielded by soup.descendants and print the name of each
# node that is not a plain string.
for child in soup.descendants:
    # Exercise: what would happen if we ran `print(child)` here instead?
    if not isinstance(child, str):
        print(child.name)

html
body
div
h1
p
p
em
hr
div
ul
li
li
li

soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>

div = soup.find('div')
div

<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>

soup.find('div', attrs={'id': 'nav'})

<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>

soup.find('ul')

<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>

soup.find_all('li')

[<li>item 1</li>, <li>item 2</li>, <li>item 3</li>]

soup.find_all('div')

[<div id="content">
 <h1>Heading here</h1>
 <p>My First paragraph</p>
 <p>My <em>second</em> paragraph</p>
 <hr/>
 </div>,
 <div id="nav">
 <ul>
 <li>item 1</li>
 <li>item 2</li>
 <li>item 3</li>
 </ul>
 </div>]

soup.find_all('li')

[<li>item 1</li>, <li>item 2</li>, <li>item 3</li>]

[x.text for x in soup.find_all('li')]

['item 1', 'item 2', 'item 3']

soup.find('p')

<p>My First paragraph</p>

soup.find('p').text

'My First paragraph'

soup.find('div')

<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>

soup.find('div').attrs

{'id': 'content'}

soup.find('div').get('id')

'content'

soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>

# Several tags inside the document have an 'id' attribute, but the top-level object
# itself does not, so this call evaluates to None (which is why no output is shown).
soup.get('id')

soup.find('div').get('id')

'content'

def download_page(i, timeout=10):
    '''Downloads page i of https://quotes.toscrape.com and returns it as a BeautifulSoup object.

    i is interpolated into the URL as the page number.
    timeout is the number of seconds passed to requests.get; without one,
    a stalled connection would block this call indefinitely.

    Raises requests.HTTPError if the server responds with an error status.
    '''
    url = f'https://quotes.toscrape.com/page/{i}'
    response = requests.get(url, timeout=timeout)

    # Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    return bs4.BeautifulSoup(response.text, 'html.parser')

soup = download_page(1)

divs = soup.find_all('div', class_='quote')
# Shortcut for:
# divs = soup.find_all('div', attrs={'class': 'quote'})

divs[0]

<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>

divs[0].find('span', class_='text').text

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

divs[0].find('small', class_='author').text

'Albert Einstein'

divs[0].find('a').get('href')

'/author/Albert-Einstein'

divs[0].find('meta', class_='keywords').get('content')

'change,deep-thoughts,thinking,world'

def process_quote(div):
    '''Returns a dictionary describing the single quote contained in div.

    The dictionary has the keys 'quote', 'author', 'author_url', and 'tags'.
    'author_url' is the href of the first <a> in div, prefixed with the site
    root; 'tags' is the raw content string of the keywords <meta> tag.
    '''
    site_root = 'https://quotes.toscrape.com'

    # Locate the four elements that carry the information we need.
    text_span = div.find('span', class_='text')
    author_small = div.find('small', class_='author')
    first_link = div.find('a')
    keywords_meta = div.find('meta', class_='keywords')

    return dict(
        quote=text_span.text,
        author=author_small.text,
        author_url=site_root + first_link.get('href'),
        tags=keywords_meta.get('content'),
    )

process_quote(divs[3])

{'quote': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 'author': 'Jane Austen',
 'author_url': 'https://quotes.toscrape.com/author/Jane-Austen',
 'tags': 'aliteracy,books,classic,humor'}

def process_page(divs):
    '''Returns a DataFrame with one row per quote div in divs.

    Each div is converted to a dictionary by process_quote; the dictionaries
    become the rows of the DataFrame.
    '''
    rows = []
    for quote_div in divs:
        rows.append(process_quote(quote_div))
    return pd.DataFrame(rows)

process_page(divs)

def make_quote_df(n):
    '''Returns a DataFrame containing the quotes on the first n pages of https://quotes.toscrape.com/.

    Pages 1 through n are downloaded one at a time with download_page, each
    page's quote divs are turned into a DataFrame by process_page, and the
    per-page DataFrames are stacked with a fresh 0-based index.

    If n is less than 1, no pages are downloaded and an empty DataFrame with
    the columns 'quote', 'author', 'author_url', and 'tags' is returned.
    '''
    dfs = []
    for i in range(1, n + 1):
        # Download page i and create a BeautifulSoup object.
        soup = download_page(i)

        # Create DataFrame using the information in that page.
        divs = soup.find_all('div', class_='quote')
        dfs.append(process_page(divs))

    if not dfs:
        # pd.concat raises a ValueError when given no objects, so hand back an
        # empty frame with the same columns a non-empty result would have.
        return pd.DataFrame(columns=['quote', 'author', 'author_url', 'tags'])

    # Stitch all DataFrames together, renumbering the rows from 0.
    return pd.concat(dfs, ignore_index=True)

quotes = make_quote_df(3)
quotes.head()

fac_response = requests.get('https://datascience.ucsd.edu/faculty/')
fac_response

<Response [200]>

soup = bs4.BeautifulSoup(fac_response.text)

# Class names tried while looking for the per-person divs:
#   'vc_clearfix'          -> too many!
#   'vc_grid-term-faculty' -> too few!
#   'vc_grid-item'         -> the right divs
divs = soup.find_all('div', class_='vc_grid-item')

len(divs)

64

divs[0].find('h4').text

'Henry Abarbanel'

divs[0].find(class_='pendari_people_title').text

'Distinguished Professor, HDSI Founding Faculty Member'

names = [div.find('h4').text for div in divs]
names[:10]

['Henry Abarbanel',
 'Ilkay Altintas',
 'Tiffany Amariuta',
 'Mikio Aoi',
 'Ery Arias-Castro',
 'Vineet Bafna',
 'Mikhail Belkin',
 'Jelena Bradic',
 'Henrik Christensen',
 'Alex Cloninger']

titles = [div.find(class_='pendari_people_title').text for div in divs]
titles[:10]

['Distinguished Professor, HDSI Founding Faculty Member',
 'SDSC Chief Data Science Officer & HDSI Founding Faculty Fellow',
 'Assistant Professor',
 'Assistant Professor',
 'Professor',
 'Professor',
 'Professor',
 'Professor',
 'Distinguished Scientist, Professor',
 'Associate Professor']

faculty = pd.DataFrame({
    'name': names, 
    'title': titles, 
})
faculty.head()

faculty[faculty['title'].str.contains('Teaching')]

from IPython.display import Image, display
display(Image(divs[29].find('img')['src']))

quotes.head(2)

tags = quotes['tags'].str.split(',')
tags

0           [change, deep-thoughts, thinking, world]
1                               [abilities, choices]
2     [inspirational, life, live, miracle, miracles]
                           ...                      
27                                                []
28                                     [imagination]
29                                           [music]
Name: tags, Length: 30, dtype: object

def encode(tag_list):
    '''Returns a Series with a 1 for every distinct, non-empty tag in tag_list.

    The tags become the index of the Series. Empty strings are skipped: a
    quote with no tags has tags == '', and ''.split(',') gives [''], which
    would otherwise create a one-hot column whose name is the empty string.
    '''
    # Pin the dtype so a row with no tags yields an empty integer Series,
    # consistent with the rows that do have tags.
    return pd.Series({k: 1 for k in tag_list if k}, dtype='int64')

tags.apply(encode)

quotes_full = pd.concat([quotes, tags.apply(encode)], axis=1).drop(columns='tags')
quotes_full.head()

quotes_full.query('inspirational == 1')

Element	Description
`<html>`	the document
`<head>`	the header
`<body>`	the body
`<div>`	a logical division of the document
`<span>`	an inline logical division
`<p>`	a paragraph
`<a>`	an anchor (hyperlink)
`<h1>, <h2>, ...`	heading(s)
`<img>`	an image

	quote	author	author_url	tags
0	“The world as we have created it is a process ...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	change,deep-thoughts,thinking,world
1	“It is our choices, Harry, that show what we t...	J.K. Rowling	https://quotes.toscrape.com/author/J-K-Rowling	abilities,choices
2	“There are only two ways to live your life. On...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	inspirational,life,live,miracle,miracles
...	...	...	...	...
7	“I have not failed. I've just found 10,000 way...	Thomas A. Edison	https://quotes.toscrape.com/author/Thomas-A-Ed...	edison,failure,inspirational,paraphrased
8	“A woman is like a tea bag; you never know how...	Eleanor Roosevelt	https://quotes.toscrape.com/author/Eleanor-Roo...	misattributed-eleanor-roosevelt
9	“A day without sunshine is like, you know, nig...	Steve Martin	https://quotes.toscrape.com/author/Steve-Martin	humor,obvious,simile

	quote	author	author_url	tags
0	“The world as we have created it is a process ...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	change,deep-thoughts,thinking,world
1	“It is our choices, Harry, that show what we t...	J.K. Rowling	https://quotes.toscrape.com/author/J-K-Rowling	abilities,choices
2	“There are only two ways to live your life. On...	Albert Einstein	https://quotes.toscrape.com/author/Albert-Eins...	inspirational,life,live,miracle,miracles
3	“The person, be it gentleman or lady, who has ...	Jane Austen	https://quotes.toscrape.com/author/Jane-Austen	aliteracy,books,classic,humor
4	“Imperfection is beauty, madness is genius and...	Marilyn Monroe	https://quotes.toscrape.com/author/Marilyn-Monroe	be-yourself,inspirational

	name	title
13	Justin Eldridge	Assistant Teaching Professor
14	Shannon Ellis	Associate Teaching Professor
29	Sam Lau	Assistant Teaching Professor
31	Soohyun Nam Liao	Assistant Teaching Professor

	change	deep-thoughts	thinking	world	...	fairy-tales		imagination	music
0	1.0	1.0	1.0	1.0	...	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
27	NaN	NaN	NaN	NaN	...	NaN	1.0	NaN	NaN
28	NaN	NaN	NaN	NaN	...	NaN	NaN	1.0	NaN
29	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	1.0

Lecture 10 - Web Scraping¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

📆 Agenda¶

Review: Accessing HTML¶

Best practices for scraping¶

Consequences of irresponsible scraping¶

Scraping vs. Using an API¶

The anatomy of HTML documents¶

What is HTML?¶

This is a heading

The anatomy of HTML documents¶

Useful tags to know¶

Example: images and hyperlinks¶

The `<div>` tag¶

Document trees¶

Parsing HTML using Beautiful Soup¶

Beautiful Soup 🍜¶

Example HTML document¶

Heading here

`BeautifulSoup` objects¶

Traversing through `descendants`¶

Finding elements in a tree¶

Using `find`¶

Using `find_all`¶

Node attributes¶

Example: Quote scraping¶

The plan¶

Downloading a single page¶

Parsing a single page¶

Putting it all together¶

Example: Scraping the HDSI Faculty page¶

Example¶

One-hot encoding¶

One-hot encoded tags¶

Web Data in Practice¶

Summary, next time¶

Next time¶

	name	title
0	Henry Abarbanel	Distinguished Professor, HDSI Founding Faculty...
1	Ilkay Altintas	SDSC Chief Data Science Officer & HDSI Foundin...
2	Tiffany Amariuta	Assistant Professor
3	Mikio Aoi	Assistant Professor
4	Ery Arias-Castro	Professor

	change	deep-thoughts	thinking	world	...	fairy-tales		imagination	music
0	1.0	1.0	1.0	1.0	...	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
27	NaN	NaN	NaN	NaN	...	NaN	1.0	NaN	NaN
28	NaN	NaN	NaN	NaN	...	NaN	NaN	1.0	NaN
29	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	1.0

	change	deep-thoughts	thinking	world	...	fairy-tales		imagination	music
0	1.0	1.0	1.0	1.0	...	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
27	NaN	NaN	NaN	NaN	...	NaN	1.0	NaN	NaN
28	NaN	NaN	NaN	NaN	...	NaN	NaN	1.0	NaN
29	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	1.0

Lecture 10 - Web Scraping¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

📆 Agenda¶

Review: Accessing HTML¶

Best practices for scraping¶

Consequences of irresponsible scraping¶

Scraping vs. Using an API¶

The anatomy of HTML documents¶

What is HTML?¶

This is a heading

The anatomy of HTML documents¶

Useful tags to know¶

Example: images and hyperlinks¶

The <div> tag¶

Document trees¶

Parsing HTML using Beautiful Soup¶

Beautiful Soup 🍜¶

Example HTML document¶

Heading here

BeautifulSoup objects¶

Traversing through descendants¶

Finding elements in a tree¶

Using find¶

Using find_all¶

Node attributes¶

Example: Quote scraping¶

The plan¶

Downloading a single page¶

Parsing a single page¶

Putting it all together¶

Example: Scraping the HDSI Faculty page¶

Example¶

One-hot encoding¶

One-hot encoded tags¶

Web Data in Practice¶

Summary, next time¶

Next time¶

The `<div>` tag¶

`BeautifulSoup` objects¶

Traversing through `descendants`¶

Using `find`¶

Using `find_all`¶

	change	deep-thoughts	thinking	world	...	fairy-tales		imagination	music
0	1.0	1.0	1.0	1.0	...	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
27	NaN	NaN	NaN	NaN	...	NaN	1.0	NaN	NaN
28	NaN	NaN	NaN	NaN	...	NaN	NaN	1.0	NaN
29	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	1.0