import pandas as pd
import numpy as np

import requests
from IPython.display import HTML, Image


r = requests.get('https://pokeapi.co/api/v2/pokemon/squirtle')
r

<Response [200]>


r.content[:1000]

b'{"abilities":[{"ability":{"name":"torrent","url":"https://pokeapi.co/api/v2/ability/67/"},"is_hidden":false,"slot":1},{"ability":{"name":"rain-dish","url":"https://pokeapi.co/api/v2/ability/44/"},"is_hidden":true,"slot":3}],"base_experience":63,"forms":[{"name":"squirtle","url":"https://pokeapi.co/api/v2/pokemon-form/7/"}],"game_indices":[{"game_index":177,"version":{"name":"red","url":"https://pokeapi.co/api/v2/version/1/"}},{"game_index":177,"version":{"name":"blue","url":"https://pokeapi.co/api/v2/version/2/"}},{"game_index":177,"version":{"name":"yellow","url":"https://pokeapi.co/api/v2/version/3/"}},{"game_index":7,"version":{"name":"gold","url":"https://pokeapi.co/api/v2/version/4/"}},{"game_index":7,"version":{"name":"silver","url":"https://pokeapi.co/api/v2/version/5/"}},{"game_index":7,"version":{"name":"crystal","url":"https://pokeapi.co/api/v2/version/6/"}},{"game_index":7,"version":{"name":"ruby","url":"https://pokeapi.co/api/v2/version/7/"}},{"game_index":7,"version":{"nam'


r.json()


r = requests.get('https://pokeapi.co/api/v2/pokemon/billy')
r

<Response [404]>


url = 'https://datascience.ucsd.edu/about/faculty/faculty/'
r = requests.get(url)
r

<Response [200]>


urlText = r.text
len(urlText)

1287143


print(urlText[:1000])

<!DOCTYPE html>
<html lang="en-US">
<head>
	<meta charset="UTF-8">
	<link rel="profile" href="https://gmpg.org/xfn/11">
	<style media="all">img.wp-smiley,img.emoji{display:inline !important;border:none !important;box-shadow:none !important;height:1em !important;width:1em !important;margin:0 .07em !important;vertical-align:-.1em !important;background:0 0 !important;padding:0 !important}
.tribe-common{-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;font-smoothing:antialiased}.tribe-common *{box-sizing:border-box}.tribe-common article,.tribe-common aside,.tribe-common details,.tribe-common figcaption,.tribe-common figure,.tribe-common footer,.tribe-common header,.tribe-common main,.tribe-common menu,.tribe-common nav,.tribe-common section,.tribe-common summary{display:block}.tribe-common svg:not(:root){overflow:hidden}.tribe-common audio,.tribe-common canvas,.tribe-common progress,.tribe-common video{display:inline-block}.tribe-common audio:not([controls]){display:non


!cat data/lec15_ex1.html

<html>
	<head>
		<title>Page title</title>
	</head>

	<body>
		<h1>This is a heading</h1>
		<p>This is a paragraph.</p>
		<p>This is another paragraph.</p>
	</body>
</html>


!cat data/lec15_ex2.html

<html>
	<head>
		<title>Project 3 - DSC 80, Spring 2022</title>
	</head>

	<body>
		<h1>Project Overview</h1>
        <img src="../imgs/mountain_lecture.png" width="200" alt="A sunset in the mountains.">
		<p>Start Project 3 by cloning our <a href="https://github.com/dsc-courses/dsc80-2022-sp/">public GitHub repo</a>.
			Note that there is <b>no checkpoint</b> for Project 3!
        </p>

		<center><h3>Make sure to submit your work as a PDF, not a notebook.</h3></center>
	</body>
</html>


!cat data/lec15_ex1.html

<html>
	<head>
		<title>Page title</title>
	</head>

	<body>
		<h1>This is a heading</h1>
		<p>This is a paragraph.</p>
		<p>This is another paragraph.</p>
	</body>
</html>


html_string = '''
<html>
    <body>
      <div id="content">
        <h1>Heading here</h1>
        <p>My First paragraph</p>
        <p>My <em>second</em> paragraph</p>
        <hr>
      </div>
      <div id="nav">
        <ul>
          <li>item 1</li>
          <li>item 2</li>
          <li>item 3</li>
        </ul>
      </div>
    </body>
</html>
'''.strip()


HTML(html_string)


import bs4


bs4.BeautifulSoup?


# Name the parser explicitly: when it is omitted, Beautiful Soup picks
# whichever parser happens to be installed, so the resulting tree can differ
# between machines (and a GuessedAtParserWarning is emitted).
# 'html.parser' ships with Python, so it is always available.
soup = bs4.BeautifulSoup(html_string, 'html.parser')
soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>


type(soup)

bs4.BeautifulSoup


print(soup.text)



Heading here
My First paragraph
My second paragraph


item 1
item 2
item 3


soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>


soup.children

<list_iterator at 0x7f9372b7d610>


nums = [1, 2, 3, 4]
# NOTE: despite its name, `double` holds the *squares* of `nums` (x ** 2).
# map() does not build a list; it returns an iterator that yields values on
# demand, which is what the following cells demonstrate.
double = map(lambda x: x ** 2, nums)
double

<map at 0x7f93507e66d0>


next(double)

1


list(double)

[4, 9, 16]


soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>


soup.children

<list_iterator at 0x7f9372b7d7c0>


len(list(soup.children))

1


root = next(soup.children)
root

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>


list(root.children)

['\n',
 <body>
 <div id="content">
 <h1>Heading here</h1>
 <p>My First paragraph</p>
 <p>My <em>second</em> paragraph</p>
 <hr/>
 </div>
 <div id="nav">
 <ul>
 <li>item 1</li>
 <li>item 2</li>
 <li>item 3</li>
 </ul>
 </div>
 </body>,
 '\n']


list(list(root.children)[1].children)

['\n',
 <div id="content">
 <h1>Heading here</h1>
 <p>My First paragraph</p>
 <p>My <em>second</em> paragraph</p>
 <hr/>
 </div>,
 '\n',
 <div id="nav">
 <ul>
 <li>item 1</li>
 <li>item 2</li>
 <li>item 3</li>
 </ul>
 </div>,
 '\n']


list(list(list(root.children)[1].children)[3].children)

['\n',
 <ul>
 <li>item 1</li>
 <li>item 2</li>
 <li>item 3</li>
 </ul>,
 '\n']


# Visit every node below the root and print the tag name of each element.
# String nodes (such as the '\n' entries seen among the children above) are
# str instances and have no useful tag name, so they are filtered out.
for child in soup.descendants:
    # print(child) # What would happen if we ran this instead?
    if not isinstance(child, str):
        print(child.name)

html
body
div
h1
p
p
em
hr
div
ul
li
li
li


soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>


div = soup.find('div')
div

<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>


soup.find('div', attrs={'id': 'nav'})

<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>


soup.find('ul')

<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>


soup.find('li')

<li>item 1</li>


soup.find_all('div')

[<div id="content">
 <h1>Heading here</h1>
 <p>My First paragraph</p>
 <p>My <em>second</em> paragraph</p>
 <hr/>
 </div>,
 <div id="nav">
 <ul>
 <li>item 1</li>
 <li>item 2</li>
 <li>item 3</li>
 </ul>
 </div>]


soup.find_all('li')

[<li>item 1</li>, <li>item 2</li>, <li>item 3</li>]


[x.text for x in soup.find_all('li')]

['item 1', 'item 2', 'item 3']


soup.find('p')

<p>My First paragraph</p>


soup.find('p').text

'My First paragraph'


soup.find('div')

<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>


soup.find('div').attrs

{'id': 'content'}


soup.find('div').get('id')

'content'


soup

<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>


soup.html.div.h1

<h1>Heading here</h1>


soup.html.div.h1.text

'Heading here'


soup.html.div.next_sibling.next_sibling.attrs

{'id': 'nav'}


fac_response = requests.get('https://datascience.ucsd.edu/about/faculty/faculty/')
fac_response

<Response [200]>


# Name the parser explicitly so the faculty page is parsed the same way on
# every machine; without it Beautiful Soup guesses based on what is installed.
soup = bs4.BeautifulSoup(fac_response.text, 'html.parser')


divs = soup.find_all('div', attrs={'data-entry-type': 'individual'})


divs[0]

<div class="cn-list-row cn-list-item vcard individual faculty lecturers" data-entry-id="229" data-entry-slug="rod-albuyeh" data-entry-type="individual" id="rod-albuyeh">
<div class="cn-entry cn-accordion" id="entry-id-22962704d3a221a3">
<div class="cn-left" style="min-width: 215px;">
<span class="cn-image-style"><span style="display: block; max-width: 100%; width: 215px"><img alt="Photo of Rod Albuyeh" class="cn-image photo" height="215" lazyload="1" loading="lazy" sizes="100vw" srcset="//datascience.ucsd.edu/wp-content/uploads/connections-images/rod-albuyeh/Rod-Albuyeh-Web-07dd8c651b197a11107f1c858ce1e390.jpg 1x" title="Photo of Rod Albuyeh" width="215"/></span></span>
</div> <!-- end cn-left-->
<div class="cn-right">
<h3 style="border-bottom: #182A48 1px solid; color:#182A48;"><a href="https://datascience.ucsd.edu/about/faculty/faculty/name/rod-albuyeh/" title="Rod Albuyeh"><span class="fn n notranslate"><span class="given-name">Rod</span> <span class="family-name">Albuyeh</span></span></a>
</h3><h4 class="title">Lecturer</h4>
<div class="cn-excerpt" id="cn-excerpt-22962704d3a221a3">

			Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.  He received his Ph.D. in Political Science at USC in 2016.  His specialties lie in anomaly detection for tabular time-series data and machine learning systems--with applications in marketing, fraud, and credit risk.  He is also interested in applying enterprise machine learning approaches to solve problems in the social sciences.

Research Interests: Machine Learning, Deployment, Scalable Systems<span class="cn-link-more"><a href="https://datascience.ucsd.edu/about/faculty/faculty/name/rod-albuyeh/">Learn More</a></span>
</div> <!--end cn-excerpt-->
<div class="cn-detail cn-hide" id="cn-detail-22962704d3a221a3">
<div class="cn-bio" id="cn-bio-22962704d3a221a3"><div class="cn-biography"><p>Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.  He received his Ph.D. in Political Science at USC in 2016.  His specialties lie in anomaly detection for tabular time-series data and machine learning systems–with applications in marketing, fraud, and credit risk.  He is also interested in applying enterprise machine learning approaches to solve problems in the social sciences.</p>
<p>Research Interests: Machine Learning, Deployment, Scalable Systems</p>
</div>
</div>
</div> <!--end cn-detail-->
</div> <!--end cn-right-->
</div> <!-- end cn-entry -->
</div>


divs[0].find('a').get('title')

'Rod Albuyeh'


divs[0].find('h4').text

'Lecturer'


divs[0].find('div', attrs={'class': 'cn-bio'}).text.strip()

'Albuyeh is Principal Data Scientist at Figure and part-time lecturer at the Halıcıoğlu Data Science Institute at UC San Diego.\xa0 He received his Ph.D. in Political Science at USC in 2016.\xa0 His specialties lie in anomaly detection for tabular time-series data and machine learning systems–with applications in marketing, fraud, and credit risk.\xa0 He is also interested in applying enterprise machine\xa0learning approaches to solve problems in the social sciences.\nResearch Interests: Machine Learning, Deployment, Scalable Systems'


names = [div.find('a').get('title') for div in divs]
names[:5]

['Rod Albuyeh',
 'Ilkay Altintas',
 'Mikio Aoi',
 'Ery Arias-Castro',
 'Vineet Bafna']


# One title per faculty entry; entries without an <h4> get an empty string.
# The inner generator looks up each <h4> once and hands it to the outer
# comprehension.
titles = [h4.text if h4 else '' for h4 in (div.find('h4') for div in divs)]


bios = [div.find('div', attrs={'class': 'cn-bio'}).text.strip() for div in divs]


# Assemble the scraped lists into a table with one row per faculty entry and
# the columns name, title and bio (in that order).
faculty = pd.DataFrame({'name': names, 'title': titles, 'bio': bios})
faculty.head()


faculty[faculty['title'] == 'Lecturer']


divs[0].find('img')

<img alt="Photo of Rod Albuyeh" class="cn-image photo" height="215" lazyload="1" loading="lazy" sizes="100vw" srcset="//datascience.ucsd.edu/wp-content/uploads/connections-images/rod-albuyeh/Rod-Albuyeh-Web-07dd8c651b197a11107f1c858ce1e390.jpg 1x" title="Photo of Rod Albuyeh" width="215"/>


def _absolute_image_url(img, base_url):
    """Return an absolute URL for the picture referenced by an ``<img>`` tag.

    The first candidate listed in ``srcset`` is preferred; ``src`` is used as
    a fallback. The result is resolved against ``base_url``, so
    protocol-relative (``//host/path``), absolute and path-relative values
    are all handled.

    Raises:
        ValueError: if the tag has neither a usable ``srcset`` nor ``src``.
    """
    from urllib.parse import urljoin

    candidates = (img.get('srcset') or '').split()
    if candidates:
        # A srcset entry looks like "<url> 1x". The first whitespace-separated
        # token is the URL; a trailing comma appears when a candidate has no
        # descriptor. (str.strip(' 1x') is not a suffix removal -- it trims
        # any of the characters ' ', '1', 'x' from *both* ends.)
        raw = candidates[0].rstrip(',')
    else:
        raw = img.get('src')
    if not raw:
        raise ValueError('image tag has no srcset or src attribute')
    return urljoin(base_url, raw)


def show_picture(name):
    """Display the photo of the faculty member called ``name``.

    ``name`` must match an entry of the module-level ``names`` list exactly;
    the entry at the same position in ``divs`` supplies the ``<img>`` tag.

    Raises:
        ValueError: if ``name`` is not in ``names`` or the entry has no image.
    """
    # `display` is only injected automatically inside IPython sessions;
    # importing it keeps the function usable elsewhere too.
    from IPython.display import display

    try:
        idx = names.index(name)
    except ValueError:
        raise ValueError('no faculty member named {!r}'.format(name)) from None

    img = divs[idx].find('img')
    if img is None:
        raise ValueError('no image found for {!r}'.format(name))

    # Address of the page the entries were scraped from; used to resolve
    # protocol-relative or relative image paths.
    page_url = 'https://datascience.ucsd.edu/about/faculty/faculty/'
    display(Image(_absolute_image_url(img, page_url)))


show_picture('Suraj Rampure')

Element	Description
`<html>`	the document
`<head>`	the document head (metadata such as the page title)
`<body>`	the body
`<div>`	a logical division of the document
`<span>`	an in-line logical division
`<p>`	a paragraph
`<a>`	an anchor (hyper-link)
`<h1>, <h2>, ...`	heading(s)
`<img>`	an image

Lecture 15 – Requests and Parsing HTML¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

APIs and web scraping¶

Programmatic requests¶

APIs¶

API terminology¶

API requests¶

Scraping¶

Accessing HTML¶

Best practices for scraping¶

Consequences of irresponsible scraping¶

The anatomy of HTML documents¶

What is HTML?¶

The anatomy of HTML documents¶

Useful tags to know¶

Example: images and hyperlinks¶

The `<div>` tag¶

Document trees¶

Example: Quote scraping¶

Parsing HTML via Beautiful Soup¶

Beautiful Soup 🍜¶

Example HTML document¶

Heading here

`BeautifulSoup` objects¶

Child nodes¶

Aside: iterators¶

Child nodes¶

Depth-first traversal through `descendants`¶

Finding elements in a tree¶

Using `find`¶

Using `find_all`¶

Node attributes¶

Example: Scraping the HDSI Faculty page¶

Example¶

Summary, next time¶

Summary¶

	name	title	bio
0	Rod Albuyeh	Lecturer	Albuyeh is Principal Data Scientist at Figure ...
1	Ilkay Altintas	Chief Data Science Officer, HDSI Founding Facu...	CHIEF DATA SCIENCE OFFICER, SDSC\nIlkay Altint...
2	Mikio Aoi	Assistant Professor	Dr. Aoi is a computational neuroscientist inte...
3	Ery Arias-Castro	Professor	Ery Arias-Castro received his Ph.D. in Statist...
4	Vineet Bafna	Professor	Vineet Bafna, Ph.D., is a Bioinformatics resea...

Lecture 15 – Requests and Parsing HTML¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

APIs and web scraping¶

Programmatic requests¶

APIs¶

API terminology¶

API requests¶

Scraping¶

Accessing HTML¶

Best practices for scraping¶

Consequences of irresponsible scraping¶

The anatomy of HTML documents¶

What is HTML?¶

The anatomy of HTML documents¶

Useful tags to know¶

Example: images and hyperlinks¶

The <div> tag¶

Document trees¶

Example: Quote scraping¶

Parsing HTML via Beautiful Soup¶

Beautiful Soup 🍜¶

Example HTML document¶

Heading here

BeautifulSoup objects¶

Child nodes¶

Aside: iterators¶

Child nodes¶

Depth-first traversal through descendants¶

Finding elements in a tree¶

Using find¶

Using find_all¶

Node attributes¶

Example: Scraping the HDSI Faculty page¶

Example¶

Summary, next time¶

Summary¶

The `<div>` tag¶

`BeautifulSoup` objects¶

Depth-first traversal through `descendants`¶

Using `find`¶

Using `find_all`¶