from dsc80_utils import *
Ouch, question 2 was hard, how would I approach questions like that in the future?
Goal: Access information about HDSI faculty members from the HDSI Faculty page.
Let's start by making a GET request to the HDSI Faculty page and see what the resulting HTML looks like.
import requests
fac_response = requests.get('https://datascience.ucsd.edu/faculty/')
fac_response
<Response [200]>
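A status code of 200 means the request succeeded. Before parsing a response, it's a good habit to check this; here's a minimal sketch using two things the requests library provides (status_code and raise_for_status):
# True if the status code indicates success.
fac_response.status_code == 200
# Alternatively, raise an exception if the request failed, instead of silently parsing an error page.
fac_response.raise_for_status()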
fac_text = fac_response.text
len(fac_text)
281940
print(fac_text[:1000])
<!DOCTYPE html> <html dir="ltr" lang="en-US" prefix="og: https://ogp.me/ns#"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <link rel="profile" href="https://gmpg.org/xfn/11" /> <title>Faculty - Halıcıoğlu Data Science Institute - UC San Diego</title> <script> /* You can add more configuration options to webfontloader by previously defining the WebFontConfig with your options */ if ( typeof WebFontConfig === "undefined" ) { WebFontConfig = new Object(); } WebFontConfig['google'] = {families: ['Jost:400,700', 'Roboto:400,500']}; (function() { var wf = document.createElement( 'script' ); wf.src = 'https://ajax.g
Wow, that is gross looking! 😰
robots.txt

Many sites have a robots.txt file in their root directory, which contains a policy that allows or disallows automatic access to their site. If you make too many requests, the site may block your IP address or otherwise rate-limit you, so check this policy before scraping.
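You can look at a site's policy yourself by requesting its robots.txt directly. A quick sketch (assuming the HDSI site serves one at the usual location):
# robots.txt, if present, lives at the root of the site.
robots_response = requests.get('https://datascience.ucsd.edu/robots.txt')
print(robots_response.text[:500])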
For instance, here's the content of a very basic webpage.
!cat data/lec10_ex1.html
<html>
<head>
    <title>Page title</title>
</head>
<body>
    <h1>This is a heading</h1>
    <p>This is a paragraph.</p>
    <p>This is <b>another</b> paragraph.</p>
</body>
</html>
Using IPython.display.HTML, we can render it directly in our notebook.
from IPython.display import HTML
HTML(filename=Path('data') / 'lec10_ex1.html')
This is a paragraph.
This is another paragraph.
HTML document: The totality of markup that makes up a webpage.
Document Object Model (DOM): The internal representation of an HTML document as a hierarchical tree structure.
HTML element: An object in the DOM, such as a paragraph, header, or title.
HTML tags: Markers that denote the start and end of an element, such as <p> and </p>.
Element | Description
---|---
<html> | the document
<head> | the header
<body> | the body
<div> | a logical division of the document
<span> | an inline logical division
<p> | a paragraph
<a> | an anchor (hyperlink)
<h1>, <h2>, ... | header(s)
<img> | an image
There are many, many more, but these are by far the most common. See this article for examples.
Tags can have attributes, which further specify how to display information on a webpage.
For instance, <img> tags have src and alt attributes (among others):
<img src="king-selfie.png" alt="A photograph of King Triton." width=500>
Hyperlinks have href attributes:
Click <a href="https://practice.dsc80.com">this link</a> to access past exams.
What do you think this webpage looks like?
!cat data/lec10_ex2.html
<html>
<head>
    <title>Project 4A and 4B - DSC 80, Spring 2024</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet" />
</head>
<body>
    <h1>Project Overview</h1>
    <img src="../imgs/platter.png" width="200" alt="My dinner last night." />
    <p>
        When the project is released, you can start it by
        <a href="https://github.com/dsc-courses/dsc80-2024-wi/">public GitHub repo</a>.
    </p>
    <center>
        <h3>
            Note that you'll have to submit your notebook as a PDF and a link to your website.
        </h3>
    </center>
</body>
</html>
The <div> tag

<div style="background-color:lightblue">
<h3>This is a heading</h3>
<p>This is a paragraph.</p>
</div>
The <div> tag defines a division, or "section," of an HTML document.
Think of a <div> as a "cell" in a Jupyter Notebook.
The <div> element is often used as a container for other HTML elements, in order to style them with CSS or to perform operations on them with JavaScript.
<div> elements often have attributes, which are important when scraping!
Under the document object model (DOM), HTML documents are trees. In DOM trees, child nodes are ordered.
What does the DOM tree look like for this document?
To start, we'll work with the source code for an HTML page with the DOM tree shown below:
The string html_string contains an HTML "document".
html_string = '''
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
'''.strip()
HTML(html_string)
My First paragraph
My second paragraph
BeautifulSoup objects

bs4.BeautifulSoup takes in a string or file-like object representing HTML (markup) and returns a parsed document.
import bs4
bs4.BeautifulSoup?
Normally, we pass the result of a GET request to bs4.BeautifulSoup, but here we will pass our hand-crafted html_string.
soup = bs4.BeautifulSoup(html_string)
soup
<html> <body> <div id="content"> <h1>Heading here</h1> <p>My First paragraph</p> <p>My <em>second</em> paragraph</p> <hr/> </div> <div id="nav"> <ul> <li>item 1</li> <li>item 2</li> <li>item 3</li> </ul> </div> </body> </html>
type(soup)
bs4.BeautifulSoup
BeautifulSoup objects have several useful attributes, e.g. text:
print(soup.text)
Heading here My First paragraph My second paragraph item 1 item 2 item 3
descendants

The descendants attribute traverses a BeautifulSoup tree using depth-first traversal.
Why depth-first? Elements closer to one another on a page are more likely to be related than elements further away.
soup.descendants
<generator object Tag.descendants at 0x7f9418dd6270>
for child in soup.descendants:
# print(child) # What would happen if we ran this instead?
if isinstance(child, str):
continue
print(child.name)
html body div h1 p p em hr div ul li li li
Practically speaking, you will not use the descendants attribute (or the related children attribute) directly very often. Instead, you will use the following methods:
soup.find(tag), which finds the first instance of a tag (the first one on the page, i.e. the first one that DFS sees). Its full signature is soup.find(name=None, attrs={}, recursive=True, text=None, **kwargs).
soup.find_all(tag), which finds all instances of a tag.

find finds tags!
soup.find('div')
<div id="content"> <h1>Heading here</h1> <p>My First paragraph</p> <p>My <em>second</em> paragraph</p> <hr/> </div>
Let's try and find the <div> element that has an id attribute equal to 'nav'.
soup.find('div', attrs={'id': 'nav'})
<div id="nav"> <ul> <li>item 1</li> <li>item 2</li> <li>item 3</li> </ul> </div>
find will return the first occurrence of a tag, regardless of its depth in the tree.
# The ul child is not at the top of the tree, but we can still find it.
soup.find('ul')
<ul> <li>item 1</li> <li>item 2</li> <li>item 3</li> </ul>
find_all

find_all returns a list of all matches.
soup.find_all('div')
[<div id="content"> <h1>Heading here</h1> <p>My First paragraph</p> <p>My <em>second</em> paragraph</p> <hr/> </div>, <div id="nav"> <ul> <li>item 1</li> <li>item 2</li> <li>item 3</li> </ul> </div>]
soup.find_all('li')
[<li>item 1</li>, <li>item 2</li>, <li>item 3</li>]
[x.text for x in soup.find_all('li')]
['item 1', 'item 2', 'item 3']
The text attribute of a tag element gets the text between the opening and closing tags.
The attrs attribute of a tag element lists all of its attributes.
The get method of a tag element gets the value of an attribute.
soup.find('p')
<p>My First paragraph</p>
soup.find('p').text
'My First paragraph'
soup.find('div')
<div id="content"> <h1>Heading here</h1> <p>My First paragraph</p> <p>My <em>second</em> paragraph</p> <hr/> </div>
soup.find('div').text
'\nHeading here\nMy First paragraph\nMy second paragraph\n\n'
soup.find('div').attrs
{'id': 'content'}
soup.find('div').get('id')
'content'
The get method must be called directly on the node that contains the attribute you're looking for.
soup
<html> <body> <div id="content"> <h1>Heading here</h1> <p>My First paragraph</p> <p>My <em>second</em> paragraph</p> <hr/> </div> <div id="nav"> <ul> <li>item 1</li> <li>item 2</li> <li>item 3</li> </ul> </div> </body> </html>
# While there are multiple 'id' attributes, none of them are in the <html> tag at the top.
soup.get('id')
soup.find('div').get('id')
'content'
Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.
<head>
<title>3*Canada-2022-06-04</title>
</head>
<body>
<h1>Spotify Top 3 - Canada</h1>
<table>
<tr class='heading'>
<th>Rank</th>
<th>Artist(s)</th>
<th>Song</th>
</tr>
<tr class=1>
<td>1</td>
<td>Harry Styles</td>
<td>As It Was</td>
</tr>
<tr class=2>
<td>2</td>
<td>Jack Harlow</td>
<td>First Class</td>
</tr>
<tr class=3>
<td>3</td>
<td>Kendrick Lamar</td>
<td>N95</td>
</tr>
</table>
</body>
Part 1: How many leaf nodes are there in the DOM tree of the previous document — that is, how many nodes have no children?
Part 2: What does the following line of code evaluate to?
len(soup.find_all("td"))
Part 3: What does the following line of code evaluate to?
soup.find("tr").get("class")
Consider quotes.toscrape.com.
Goal: Extract quotes (and relevant metadata) into a DataFrame.
Specifically, let's try to make a DataFrame that looks like the one below:
quote | author | author_url | tags | |
---|---|---|---|---|
0 | “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” | Albert Einstein | https://quotes.toscrape.com/author/Albert-Einstein | change,deep-thoughts,thinking,world |
1 | “It is our choices, Harry, that show what we truly are, far more than our abilities.” | J.K. Rowling | https://quotes.toscrape.com/author/J-K-Rowling | abilities,choices |
2 | “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” | Albert Einstein | https://quotes.toscrape.com/author/Albert-Einstein | inspirational,life,live,miracle,miracles |
Ask an LLM to write code to scrape the first ten pages of quotes from https://quotes.toscrape.com/ into a DataFrame called quotes_llm. The first three rows of quotes_llm should have the three quotes above. The last row of quotes_llm should contain a quote from George R.R. Martin.
After having an LLM write code, paste it below and see if it works. If it doesn't work, try to adjust your prompt until it does. Once you have something that works, submit your final prompt and generated code to http://q.dsc80.com.
Eventually, we will create a single function, make_quote_df, which takes in an integer n and returns a DataFrame with the quotes on the first n pages of quotes.toscrape.com.
To do this, we will define several helper functions:
download_page(i), which downloads a single page (page i) and returns a BeautifulSoup object of the response.
process_quote(div), which takes in a <div> tree corresponding to a single quote and returns a dictionary containing all of the relevant information for that quote.
process_page(divs), which takes in a list of <div> trees corresponding to a single page and returns a DataFrame containing all of the relevant information for all quotes on that page.
Key principle: some of our helper functions will make requests, and others will parse, but none will do both!
def download_page(i):
url = f'https://quotes.toscrape.com/page/{i}'
request = requests.get(url)
return bs4.BeautifulSoup(request.text)
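One small aside, not something we rely on in this lecture: if you don't name a parser, bs4 chooses one for you and may print a warning about it. To make the choice explicit, you can pass a parser name as the second argument; here's the same helper with that one change (the name download_page_explicit is just for this sketch):
def download_page_explicit(i):
    url = f'https://quotes.toscrape.com/page/{i}'
    request = requests.get(url)
    # 'html.parser' is the parser built into Python; bs4 can also use others (e.g. 'lxml') if installed.
    return bs4.BeautifulSoup(request.text, 'html.parser')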
In make_quote_df, we will call download_page repeatedly: once for i=1, once for i=2, ..., and once for i=n. For now, we will work with just page 1 (chosen arbitrarily).
soup = download_page(1)
Let's look at the page's source code (right click the page and click "Inspect" in Chrome) to find where the quotes in the page are located.
divs = soup.find_all('div', class_='quote')
# class_='quote' is a shortcut for the following; the trailing underscore is needed because class is a reserved word in Python:
# divs = soup.find_all('div', attrs={'class': 'quote'})
divs[0]
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork"> <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span> <span>by <small class="author" itemprop="author">Albert Einstein</small> <a href="/author/Albert-Einstein">(about)</a> </span> <div class="tags"> Tags: <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/> <a class="tag" href="/tag/change/page/1/">change</a> <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a> <a class="tag" href="/tag/thinking/page/1/">thinking</a> <a class="tag" href="/tag/world/page/1/">world</a> </div> </div>
From this <div>, we can extract the quote, author name, author's URL, and tags.
divs[0]
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork"> <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span> <span>by <small class="author" itemprop="author">Albert Einstein</small> <a href="/author/Albert-Einstein">(about)</a> </span> <div class="tags"> Tags: <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/> <a class="tag" href="/tag/change/page/1/">change</a> <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a> <a class="tag" href="/tag/thinking/page/1/">thinking</a> <a class="tag" href="/tag/world/page/1/">world</a> </div> </div>
# The quote.
divs[0].find('span', class_='text').text
'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
# The author.
divs[0].find('small', class_='author').text
'Albert Einstein'
# The URL for the author.
divs[0].find('a').get('href')
'/author/Albert-Einstein'
# The quote's tags.
divs[0].find('meta', class_='keywords').get('content')
'change,deep-thoughts,thinking,world'
Let's implement our next function, process_quote, which takes in a <div> corresponding to a single quote and returns a dictionary containing the quote's information.
Why use a dictionary? Passing pd.DataFrame a list of dictionaries is an easy way to create a DataFrame.
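To see that pattern on its own, here's a tiny standalone example (the rows below are made up purely for illustration):
import pandas as pd
# Each dictionary becomes one row; the keys become the column names.
pd.DataFrame([
    {'quote': 'First quote', 'author': 'Author A'},
    {'quote': 'Second quote', 'author': 'Author B'},
])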
def process_quote(div):
quote = div.find('span', class_='text').text
author = div.find('small', class_='author').text
author_url = 'https://quotes.toscrape.com' + div.find('a').get('href')
tags = div.find('meta', class_='keywords').get('content')
return {'quote': quote, 'author': author, 'author_url': author_url, 'tags': tags}
process_quote(divs[-1])
{'quote': '“A day without sunshine is like, you know, night.”', 'author': 'Steve Martin', 'author_url': 'https://quotes.toscrape.com/author/Steve-Martin', 'tags': 'humor,obvious,simile'}
Our last helper function will take in a list of <div>s, call process_quote on each <div> in the list, and return a DataFrame.
def process_page(divs):
return pd.DataFrame([process_quote(div) for div in divs])
process_page(divs)
quote | author | author_url | tags | |
---|---|---|---|---|
0 | “The world as we have created it is a process ... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | change,deep-thoughts,thinking,world |
1 | “It is our choices, Harry, that show what we t... | J.K. Rowling | https://quotes.toscrape.com/author/J-K-Rowling | abilities,choices |
2 | “There are only two ways to live your life. On... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | inspirational,life,live,miracle,miracles |
... | ... | ... | ... | ... |
7 | “I have not failed. I've just found 10,000 way... | Thomas A. Edison | https://quotes.toscrape.com/author/Thomas-A-Ed... | edison,failure,inspirational,paraphrased |
8 | “A woman is like a tea bag; you never know how... | Eleanor Roosevelt | https://quotes.toscrape.com/author/Eleanor-Roo... | misattributed-eleanor-roosevelt |
9 | “A day without sunshine is like, you know, nig... | Steve Martin | https://quotes.toscrape.com/author/Steve-Martin | humor,obvious,simile |
10 rows × 4 columns
def make_quote_df(n):
'''Returns a DataFrame containing the quotes on the first n pages of https://quotes.toscrape.com/.'''
dfs = []
for i in range(1, n+1):
        # Download page i and create a BeautifulSoup object.
soup = download_page(i)
# Create DataFrame using the information in that page.
divs = soup.find_all('div', class_='quote')
df = process_page(divs)
# Append DataFrame to dfs.
dfs.append(df)
# Stitch all DataFrames together.
return pd.concat(dfs).reset_index(drop=True)
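Note that make_quote_df fires off its n requests back-to-back. When scraping a real site repeatedly, it's polite (and helps you avoid being blocked, per the robots.txt discussion earlier) to pause briefly between requests. A sketch of one way to do that (the one-second delay is arbitrary):
import time
def make_quote_df_politely(n, delay=1):
    '''Like make_quote_df, but pauses between requests.'''
    dfs = []
    for i in range(1, n + 1):
        soup = download_page(i)
        divs = soup.find_all('div', class_='quote')
        dfs.append(process_page(divs))
        time.sleep(delay)  # Wait before requesting the next page.
    return pd.concat(dfs).reset_index(drop=True)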
quotes = make_quote_df(3)
quotes.head()
quote | author | author_url | tags | |
---|---|---|---|---|
0 | “The world as we have created it is a process ... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | change,deep-thoughts,thinking,world |
1 | “It is our choices, Harry, that show what we t... | J.K. Rowling | https://quotes.toscrape.com/author/J-K-Rowling | abilities,choices |
2 | “There are only two ways to live your life. On... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | inspirational,life,live,miracle,miracles |
3 | “The person, be it gentleman or lady, who has ... | Jane Austen | https://quotes.toscrape.com/author/Jane-Austen | aliteracy,books,classic,humor |
4 | “Imperfection is beauty, madness is genius and... | Marilyn Monroe | https://quotes.toscrape.com/author/Marilyn-Monroe | be-yourself,inspirational |
quotes[quotes['author'] == 'Albert Einstein']
quote | author | author_url | tags | |
---|---|---|---|---|
0 | “The world as we have created it is a process ... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | change,deep-thoughts,thinking,world |
2 | “There are only two ways to live your life. On... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | inspirational,life,live,miracle,miracles |
5 | “Try not to become a man of success. Rather be... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | adulthood,success,value |
12 | “If you can't explain it to a six year old, yo... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | simplicity,understand |
26 | “If you want your children to be intelligent, ... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | children,fairy-tales |
28 | “Logic will get you from A to Z; imagination w... | Albert Einstein | https://quotes.toscrape.com/author/Albert-Eins... | imagination |
The elements in the 'tags' column are all strings, but they look like lists. This is not ideal, as we will see shortly.
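If you did want actual lists, one quick fix (just a sketch; we keep the strings for now) is to split each string on commas:
# Split each comma-separated string of tags into a Python list.
quotes.assign(tags_list=quotes['tags'].str.split(','))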
Let's try and extract a list of HDSI Faculty from datascience.ucsd.edu/faculty.
fac_response = requests.get('https://datascience.ucsd.edu/faculty/')
fac_response
<Response [200]>
soup = bs4.BeautifulSoup(fac_response.text)
It's not easy identifying which <div>s we want. The Inspect tool makes this easier, but it's good to verify that find_all is finding the right number of elements.
divs = soup.find_all(
class_='vc_grid-item',
)
len(divs)
63
From each of these <div>s, we need to extract the faculty member's name. It seems like names are stored as text within the <h4> tag.
divs[0]
<div class="vc_grid-item vc_clearfix col_1-5 vc_grid-item-zone-c-bottom vc_visible-item vc_grid-term-council vc_grid-term-faculty vc_grid-term-faculty-fellows"> <a class="anchor-link" id="ilkay-altintas" name="ilkay-altintas"></a> <div class="vc_grid-item-mini vc_clearfix"><div class="vc_gitem-animated-block"> <div class="vc_gitem-zone vc_gitem-zone-a vc-gitem-zone-height-mode-auto vc-gitem-zone-height-mode-auto-1-1" style="background-image: url(https://datascience.ucsd.edu/wp-content/uploads/2022/10/ilkayaltintas_headshot.jpg) !important;"><a class="vc_gitem-link vc-zone-link" href="https://datascience.ucsd.edu/people/ilkay-altintas/"></a><img alt="Ilkay Altintas" class="vc_gitem-zone-img" decoding="async" src="https://datascience.ucsd.edu/wp-content/uploads/2022/10/ilkayaltintas_headshot.jpg"/> <div class="vc_gitem-zone-mini"></div> </div> </div><div class="vc_gitem-zone vc_gitem-zone-c"> <div class="vc_gitem-zone-mini"> <div class="vc_gitem_row vc_row vc_gitem-row-position-top"> <div class="vc_col-sm-12 vc_gitem-col vc_gitem-col-align-"> <div class="vc_custom_heading vc_gitem-post-data vc_gitem-post-data-source-post_title"> <h4 style="text-align: left"><a href="https://datascience.ucsd.edu/people/ilkay-altintas/">Ilkay Altintas</a></h4> </div> <div class="vc_gitem-align-left fields"> <div class="field pendari_people_title">SDSC Chief Data Science Officer & HDSI Founding Faculty Fellow</div> </div> <div class="excerpt"> </div> <div class="terms"> Council Faculty Faculty Fellows </div> </div> </div> </div> </div></div></div>
divs[0].find('h4').text
'Ilkay Altintas'
We can also extract job titles:
divs[0].find(class_='field').text
'SDSC Chief Data Science Officer & HDSI Founding Faculty Fellow'
Let's create a DataFrame consisting of names and job titles for each faculty member.
names = [div.find('h4').text for div in divs]
names[:10]
['Ilkay Altintas', 'Tiffany Amariuta', 'Mikio Aoi', 'Ery Arias-Castro', 'Vineet Bafna', 'Mikhail Belkin', 'Jelena Bradic', 'Henrik Christensen', 'Alex Cloninger', 'Anders Dale']
titles = [div.find(class_='field').text for div in divs]
titles[:10]
['SDSC Chief Data Science Officer & HDSI Founding Faculty Fellow', 'Assistant Professor', 'Assistant Professor', 'Professor', 'Professor', 'Professor', 'Professor', 'Distinguished Scientist, Professor', 'Associate Professor', 'Distinguished Scientist, Professor']
faculty = pd.DataFrame({
'name': names,
'title': titles,
})
faculty.head()
name | title | |
---|---|---|
0 | Ilkay Altintas | SDSC Chief Data Science Officer & HDSI Foundin... |
1 | Tiffany Amariuta | Assistant Professor |
2 | Mikio Aoi | Assistant Professor |
3 | Ery Arias-Castro | Professor |
4 | Vineet Bafna | Professor |
Now we have a DataFrame!
faculty[faculty['title'].str.contains('Teaching') | faculty['title'].str.contains('Lecturer')]
name | title | |
---|---|---|
12 | Justin Eldridge | Assistant Teaching Professor |
13 | Shannon Ellis | Associate Teaching Professor |
27 | Marina Langlois | Lecturer |
... | ... | ... |
39 | Suraj Rampure | Lecturer |
47 | Jack Silberman | Lecturer |
51 | Janine Tiefenbruck | Lecturer |
9 rows × 2 columns
What if we want to get faculty members' pictures?
from IPython.display import Image, display
def show_picture(name):
idx = faculty[faculty['name'].str.lower().str.contains(name.lower())].index[0]
display(Image(divs[idx].find('img')['src'], width=200, height=200))
show_picture('marina')
Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.
<head>
<title>3*Canada-2022-06-04</title>
</head>
<body>
<h1>Spotify Top 3 - Canada</h1>
<table>
<tr class='heading'>
<th>Rank</th>
<th>Artist(s)</th>
<th>Song</th>
</tr>
<tr class=1>
<td>1</td>
<td>Harry Styles</td>
<td>As It Was</td>
</tr>
<tr class=2>
<td>2</td>
<td>Jack Harlow</td>
<td>First Class</td>
</tr>
<tr class=3>
<td>3</td>
<td>Kendrick Lamar</td>
<td>N95</td>
</tr>
</table>
</body>
Part 4: Complete the implementation of the function top_nth, which takes in a positive integer n and returns the name of the n-th ranked song in the HTML document. For instance, top_nth(2) should evaluate to "First Class" (n=1 corresponds to the top song).
Note: Your implementation should work in the case that the page contains more than 3 songs.
def top_nth(n):
return soup.find("tr", attrs=__(a)__).find_all("td")__(b)__
The spread of true and false news online by Vosoughi et al. compared how true and false news spreads via Twitter:
There is worldwide concern over false news and the possibility that it can influence political, economic, and social well-being. To understand how false news spreads, Vosoughi et al. used a data set of rumor cascades on Twitter from 2006 to 2017. About 126,000 rumors were spread by ∼3 million people. False news reached more people than the truth; the top 1% of false news cascades diffused to between 1000 and 100,000 people, whereas the truth rarely diffused to more than 1000 people. Falsehood also diffused faster than the truth. The degree of novelty and the emotional reactions of recipients may be responsible for the differences observed.
To conduct this study, the authors used the Twitter API for accessing tweets and web-scraped fact-checking websites to verify whether news was false or not.
soup.find and soup.find_all are the functions you will use most often.
Next time: Regular expressions!