import pandas as pd
import numpy as np
import os
import util
import plotly.express as px
import plotly.figure_factory as ff
pd.options.plotting.backend = 'plotly'
heights = pd.read_csv(os.path.join('data', 'midparent.csv'))
heights = (
heights
.rename(columns={'childHeight': 'child', 'childNum': 'number'})
.drop('midparentHeight', axis=1)
)
heights.head()
 | family | father | mother | children | number | gender | child |
---|---|---|---|---|---|---|---
0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |
np.random.seed(42) # So that we get the same results each time (for lecture).
heights_mcar = util.make_mcar(heights, 'child', pct=0.5)
heights_mar = util.make_mar_on_cat(heights, 'child', 'gender', pct=0.5)
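util is a course-provided helper module whose implementation isn't shown here. As a rough sketch, an MCAR generator can be written as follows — make_mcar_sketch and the demo data are hypothetical stand-ins, not util's actual code:

```python
import numpy as np
import pandas as pd

def make_mcar_sketch(df, col, pct=0.5):
    """Return a copy of df in which a fraction pct of `col` is missing,
    Missing Completely At Random."""
    df = df.copy()
    # Rows are chosen uniformly at random, independent of every column.
    missing_idx = df.sample(frac=pct, random_state=0).index
    df.loc[missing_idx, col] = np.nan
    return df

rng = np.random.default_rng(0)
demo = pd.DataFrame({'gender': ['male', 'female'] * 50,
                     'child': rng.normal(66, 3, 100)})
demo_mcar = make_mcar_sketch(demo, 'child', pct=0.5)
demo_mcar['child'].isna().mean()  # 0.5
```

Since the rows are chosen without looking at any column, the missingness of 'child' doesn't depend on anything — exactly the MCAR assumption.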
Suppose the 'child' column has missing values.
- If 'child' is MCAR, then fill in each of the missing values using the mean of the observed values.
- If 'child' is MAR dependent on a categorical column, then fill in each of the missing values using the mean of the observed values in each category. For instance, if 'child' is MAR dependent on 'gender', we can fill in:
    - missing female 'child' heights with the observed mean for female children, and
    - missing male 'child' heights with the observed mean for male children.
- If 'child' is MAR dependent on a numerical column, then bin the numerical column to make it categorical, then follow the procedure above. See Lab 5, Question 5!

def mean_impute(ser):
    return ser.fillna(ser.mean())
heights_mar_cond = heights_mar.groupby('gender')['child'].transform(mean_impute).to_frame() # Conditional mean imputation (good, since MAR).
heights_mar_mfilled = heights_mar.fillna(heights_mar['child'].mean()) # Single mean imputation (bad, since MAR).
df_map = {'Original': heights, 'MAR, Unfilled': heights_mar,
'MAR, Mean Imputed': heights_mar_mfilled, 'MAR, Conditional Mean Imputed': heights_mar_cond}
util.multiple_kdes(df_map)
The pink distribution (conditional mean imputation) does a better job of approximating the turquoise distribution (the full dataset with no missing values) than the purple distribution (single mean imputation).
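For the case where the missing column is MAR dependent on a numerical column, the binning step can be sketched with synthetic data — the column names, bin count, and parameters below are illustrative, not from the lecture:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'father': rng.normal(69, 2.5, 200)})
df['child'] = df['father'] + rng.normal(0, 2, 200)
# Knock out 30% of the child heights.
df.loc[df.sample(frac=0.3, random_state=0).index, 'child'] = np.nan

# Bin the numerical column to make it categorical...
df['father_bin'] = pd.qcut(df['father'], 4)

# ...then fill in missing values with the observed mean within each bin.
df['child'] = df.groupby('father_bin')['child'].transform(
    lambda ser: ser.fillna(ser.mean())
)
df['child'].isna().sum()  # 0
```

pd.qcut is used here so that every bin has roughly the same number of rows, which guarantees each bin contains some observed values to average.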
Suppose the 'child' column has missing values.
- If 'child' is MCAR, then fill in each of the missing values with randomly selected observed 'child' heights.
    - For example, if there are 5 missing 'child' values, pick 5 of the not-missing 'child' values.
- If 'child' is MAR dependent on a categorical column, sample from the observed values separately for each category.

def create_imputed(col):
    col = col.copy()
    # Find the number of missing child heights for that gender.
    num_null = col.isna().sum()
    # Sample num_null observed child heights for that gender.
    fill_values = np.random.choice(col.dropna(), num_null)
    # Fill in missing values and return col.
    col[col.isna()] = fill_values
    return col
Let's use transform to call create_imputed separately on each 'gender'.
heights_mar_pfilled = heights_mar.copy()
heights_mar_pfilled['child'] = heights_mar.groupby('gender')['child'].transform(create_imputed)
heights_mar_pfilled['child'].head()
0    73.2
1    69.2
2    62.0
3    62.5
4    73.5
Name: child, dtype: float64
df_map['MAR, Conditionally Probabilistically Imputed'] = heights_mar_pfilled
util.multiple_kdes(df_map)
The green distribution (conditional probabilistic imputation) does the best job of approximating the turquoise distribution (the full dataset with no missing values)!
Remember that the graph above is interactive – you can hide/show lines by clicking them in the legend.
Steps:

1. Start with the observed, incomplete data.
2. Create several imputed versions of the data through a probabilistic procedure.
3. Estimate the parameters of interest for each imputed dataset.
4. Pool the parameter estimates into a single estimate.

Let's try this procedure out on the heights_mcar dataset.
heights_mcar.head()
 | family | father | mother | children | number | gender | child |
---|---|---|---|---|---|---|---
0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 4 | 3 | female | NaN |
3 | 1 | 78.5 | 67.0 | 4 | 4 | female | NaN |
4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |
Each time we run the following cell, it generates a new imputed version of the 'child'
column.
create_imputed(heights_mcar['child']).head()
0    73.2
1    69.2
2    60.0
3    63.0
4    73.5
Name: child, dtype: float64
Let's run the above procedure 100 times.
mult_imp = pd.concat([create_imputed(heights_mcar['child']).rename(k) for k in range(100)], axis=1)
mult_imp.head()
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | ... | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 |
1 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | ... | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 |
2 | 62.0 | 68.0 | 65.7 | 65.5 | 70.0 | 66.0 | 69.0 | 70.0 | 70.0 | 65.5 | ... | 64.5 | 67.5 | 75.0 | 67.5 | 73.0 | 71.0 | 71.0 | 70.5 | 69.5 | 65.0 |
3 | 70.0 | 62.5 | 63.0 | 67.0 | 69.0 | 72.0 | 64.5 | 69.7 | 68.5 | 70.5 | ... | 64.0 | 65.0 | 63.0 | 65.0 | 62.5 | 67.5 | 61.7 | 63.0 | 62.0 | 65.5 |
4 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | ... | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 |
5 rows × 100 columns
Let's plot some of the imputed columns from the previous slide.
# Random sample of 15 imputed columns.
mult_imp_sample = mult_imp.sample(15, axis=1)
fig = ff.create_distplot(mult_imp_sample.to_numpy().T, list(mult_imp_sample.columns), show_hist=False, show_rug=False)
fig.update_xaxes(title='child')
Let's look at the distribution of means across the imputed columns.
px.histogram(pd.DataFrame(mult_imp.mean()), nbins=15, histnorm='probability',
title='Distribution of Imputed Sample Means')
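The histogram above shows the spread of the per-imputation estimates; the last step of multiple imputation pools them into a single estimate (for a mean, by averaging). A minimal sketch with synthetic stand-in data (not the heights dataset):

```python
import numpy as np

rng = np.random.default_rng(42)
observed = rng.normal(66, 3, 500)   # stand-in for the observed child heights
num_missing = 500                   # suppose just as many values were missing

# Impute 100 times; compute the mean of each completed dataset.
means = []
for _ in range(100):
    fills = rng.choice(observed, num_missing)   # probabilistic imputation
    means.append(np.concatenate([observed, fills]).mean())

# Pool the per-imputation estimates into one.
pooled = float(np.mean(means))
```

Because each imputation draws fills from the observed values, the pooled mean lands very close to the mean of the observed data, while the spread of `means` reflects the extra uncertainty due to missingness.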
See the end of Lecture 13 for a detailed summary of all imputation techniques that we've seen so far.
The material we're covering now is not on the Midterm Exam.
So far, the data we've worked with has come from .csv files. Fun fact: UCSD was a node in ARPANET, the predecessor to the modern internet.
The request methods you will use most often are GET and POST; see Mozilla's web docs for a detailed list of request methods.
- GET is used to request data from a specified resource.
- POST is used to send data to the server.

Example GET request

Below is an example GET HTTP request made by a browser when accessing datascience.ucsd.edu.
GET / HTTP/1.1
Host: datascience.ucsd.edu
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36
Connection: keep-alive
Accept-Language: en-US,en;q=0.9
The first line (GET / HTTP/1.1) is called the "request line", and the lines afterwards are called "header fields". Header fields contain metadata.

Example GET response

The response below was generated by executing the request on the previous slide.
HTTP/1.1 200 OK
Date: Fri, 29 Apr 2022 02:54:41 GMT
Server: Apache
Link: <https://datascience.ucsd.edu/wp-json/>; rel="https://api.w.org/"
Link: <https://datascience.ucsd.edu/wp-json/wp/v2/pages/2427>; rel="alternate"; type="application/json"
Link: <https://datascience.ucsd.edu/>; rel=shortlink
Content-Type: text/html; charset=UTF-8
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<link rel="profile" href="https://gmpg.org/xfn/11">
<style media="all">img.wp-smiley,img.emoji{display:inline !important;border:none
...
Read Inside a viral website, an account of what it's like to run a site that gained 50 million+ views in 5 days.
We'll see two ways to make HTTP requests outside of a browser:
- The command-line tool curl.
- The Python requests package.

curl

curl is a command-line tool that sends HTTP requests, like a browser.
When you provide curl with a URL, it sends an HTTP request (you can specify the request method, e.g. GET or POST).

Example: GET requests via curl

The following call to curl issues a GET request.

# `-v` is short for verbose
curl -v https://httpbin.org/html
In a Jupyter Notebook, you can run command-line tools by placing a ! before them. Let's try that here.

# Compare the output to what you see when you go to https://httpbin.org/html in your browser!
!curl -v https://httpbin.org/html
*   Trying 34.199.93.197:443...
* Connected to httpbin.org (34.199.93.197) port 443 (#0)
* ALPN, offering h2
* ALPN, offering http/1.1
* successfully set certificate verify locations:
*  CAfile: /etc/ssl/cert.pem
*  CApath: none
* TLSv1.2 (OUT), TLS handshake, Client hello (1):
* TLSv1.2 (IN), TLS handshake, Server hello (2):
* TLSv1.2 (IN), TLS handshake, Certificate (11):
* TLSv1.2 (IN), TLS handshake, Server key exchange (12):
* TLSv1.2 (IN), TLS handshake, Server finished (14):
* TLSv1.2 (OUT), TLS handshake, Client key exchange (16):
* TLSv1.2 (OUT), TLS change cipher, Change cipher spec (1):
* TLSv1.2 (OUT), TLS handshake, Finished (20):
* TLSv1.2 (IN), TLS change cipher, Change cipher spec (1):
* TLSv1.2 (IN), TLS handshake, Finished (20):
* SSL connection using TLSv1.2 / ECDHE-RSA-AES128-GCM-SHA256
* ALPN, server accepted to use h2
* Server certificate:
*  subject: CN=httpbin.org
*  start date: Mar  1 00:00:00 2023 GMT
*  expire date: Nov 19 23:59:59 2023 GMT
*  subjectAltName: host "httpbin.org" matched cert's "httpbin.org"
*  issuer: C=US; O=Amazon; CN=Amazon RSA 2048 M02
*  SSL certificate verify ok.
* Using HTTP2, server supports multi-use
* Connection state changed (HTTP/2 confirmed)
* Copying HTTP/2 data in stream buffer to connection buffer after upgrade: len=0
* Using Stream ID: 1 (easy handle 0x7fed9c010000)
> GET /html HTTP/2
> Host: httpbin.org
> user-agent: curl/7.77.0
> accept: */*
>
* Connection state changed (MAX_CONCURRENT_STREAMS == 128)!
< HTTP/2 502
< server: awselb/2.0
< date: Tue, 09 May 2023 19:44:58 GMT
< content-type: text/html
< content-length: 122
<
<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
</body>
</html>
* Connection #0 to host httpbin.org left intact
Queries in a GET request

In a URL, the ? begins a query, which contains extra parameters for the request.

requests

requests is a Python module that allows you to use Python to interact with the internet! There are several packages that can make HTTP requests (e.g. urllib), but requests is arguably the easiest to use.

import requests
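The query part of a URL (everything after the ?) is just key=value pairs joined by &. Python's standard library can build one; example.com below is a placeholder URL and the parameter names are made up:

```python
from urllib.parse import urlencode

params = {'q': 'ucsd dsc 80', 'num': 5}
url = 'https://example.com/search?' + urlencode(params)
url  # 'https://example.com/search?q=ucsd+dsc+80&num=5'
```

In practice, requests.get accepts a params= dictionary and builds the query string for you.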
Example: GET requests via requests

To access the source code of the UCSD home page, all we need to run is the following:
requests.get('https://ucsd.edu').text
res = requests.get('https://ucsd.edu')
res
is now a Response
object.
res
<Response [200]>
The text attribute of res is a string containing the entire response.
type(res.text)
str
len(res.text)
43477
print(res.text[:1000])
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta content="IE=edge" http-equiv="X-UA-Compatible"/> <meta content="width=device-width, initial-scale=1" name="viewport"/> <title>University of California San Diego</title> <meta content="University of California, San Diego" name="ORGANIZATION"/> <meta content="index,follow,noarchive" name="robots"/> <meta content="UCSD" name="SITE"/> <meta content="University of California San Diego" name="PAGETITLE"/> <meta content="The University California San Diego is one of the world's leading public research universities, located in beautiful La Jolla, California" name="DESCRIPTION"/> <link href="favicon.ico" rel="icon"/> <!-- Site-specific CSS files --> <link href="https://www.ucsd.edu/_resources/css/vendor/brix_sans.css" rel="stylesheet" type="text/css"/> <link href="https://www.ucsd.edu/_resources/css/vendor/refrigerator_deluxe.css" rel="stylesheet"
Example: POST requests via requests

The following call to requests.post makes a POST request to https://httpbin.org/post, with a 'name' parameter of 'King Triton'.
post_res = requests.post('https://httpbin.org/post',
data={'name': 'King Triton'})
post_res
<Response [200]>
post_res.text
'{\n "args": {}, \n "data": "", \n "files": {}, \n "form": {\n "name": "King Triton"\n }, \n "headers": {\n "Accept": "*/*", \n "Accept-Encoding": "gzip, deflate, br", \n "Content-Length": "16", \n "Content-Type": "application/x-www-form-urlencoded", \n "Host": "httpbin.org", \n "User-Agent": "python-requests/2.28.2", \n "X-Amzn-Trace-Id": "Root=1-645aa2bb-65eb767a4c861cd4354189aa"\n }, \n "json": null, \n "origin": "99.76.231.122", \n "url": "https://httpbin.org/post"\n}\n'
# More on this shortly!
post_res.json()
{'args': {}, 'data': '', 'files': {}, 'form': {'name': 'King Triton'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, br', 'Content-Length': '16', 'Content-Type': 'application/x-www-form-urlencoded', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.28.2', 'X-Amzn-Trace-Id': 'Root=1-645aa2bb-65eb767a4c861cd4354189aa'}, 'json': None, 'origin': '99.76.231.122', 'url': 'https://httpbin.org/post'}
What happens when we try to make a POST request somewhere we're unable to?
yt_res = requests.post('https://youtube.com',
data={'name': 'King Triton'})
yt_res
<Response [400]>
yt_res.text is a string containing HTML – we can render it in-line using IPython.display.HTML.
from IPython.display import HTML
- Earlier, we saw a status code of 200, which means there were no issues.
- Other common status codes: 400 (bad request), 404 (page not found), and 500 (internal server error). You've likely seen a 404 before!

yt_res.status_code
400
Response objects also have an ok attribute, which returns a bool describing whether the request was successful.

yt_res.status_code, yt_res.ok
(400, False)
post_res.status_code, post_res.ok
(200, True)
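The standard phrase for each status code lives in Python's standard library, so you can look one up without memorizing it:

```python
from http import HTTPStatus

# Print the standard reason phrase for some common status codes.
for code in (200, 400, 404, 500):
    print(code, HTTPStatus(code).phrase)
# 200 OK
# 400 Bad Request
# 404 Not Found
# 500 Internal Server Error
```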
Make requests sparingly; if you need to make many, pause between them (e.g. using time.sleep).

Responses typically come in one of two formats: HTML or JSON.
- The response body of a GET request is usually either JSON (when using an API) or HTML (when accessing a webpage).
- The response body of a POST request is usually JSON.

JSON data types

Type | Description |
---|---|
String | Anything inside double quotes. |
Number | Any number (no difference between ints and floats). |
Boolean | true and false . |
Null | JSON's empty value, denoted by null . |
Array | Like Python lists. |
Object | A collection of key-value pairs, like dictionaries. Keys must be strings, values can be anything (even other objects). |
See json-schema.org for more details.
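Each JSON type corresponds to a Python type; json.dumps and json.loads convert between a Python object and its JSON string representation:

```python
import json

# One value of each JSON type.
obj = {'string': 'hi', 'number': 2.5, 'boolean': True,
       'null': None, 'array': [1, 2, 3],
       'object': {'nested': 'yes'}}

s = json.dumps(obj)   # Python object -> JSON string; True becomes true, None becomes null
json.loads(s) == obj  # JSON string -> Python object; the round trip is exact
```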
import json
f = open(os.path.join('data', 'family.json'), 'r')
family_tree = json.load(f)
family_tree
{'name': 'Grandma', 'age': 94, 'children': [{'name': 'Dad', 'age': 60, 'children': [{'name': 'Me', 'age': 24}, {'name': 'Brother', 'age': 22}]}, {'name': 'My Aunt', 'children': [{'name': 'Cousin 1', 'age': 34}, {'name': 'Cousin 2', 'age': 36, 'children': [{'name': 'Cousin 2 Jr.', 'age': 2}]}]}]}
family_tree['children'][0]['children'][0]['age']
24
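Since JSON objects can nest arbitrarily, recursion is a natural way to process them. Here's a hypothetical helper (not from the lecture) that collects every 'name' in a tree shaped like family_tree:

```python
def all_names(tree):
    """Recursively collect the 'name' of every person in the tree."""
    names = [tree['name']]
    for child in tree.get('children', []):
        names.extend(all_names(child))
    return names

tree = {'name': 'Grandma', 'children': [
    {'name': 'Dad', 'children': [{'name': 'Me'}, {'name': 'Brother'}]},
]}
all_names(tree)  # ['Grandma', 'Dad', 'Me', 'Brother']
```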
eval

eval, which stands for "evaluate", is a function built into Python. It evaluates a string as if it were a Python expression.

x = 4
eval('x + 5')
9
It seems like eval can do the same thing that json.load does...

f = open(os.path.join('data', 'family.json'), 'r')
eval(f.read())
{'name': 'Grandma', 'age': 94, 'children': [{'name': 'Dad', 'age': 60, 'children': [{'name': 'Me', 'age': 24}, {'name': 'Brother', 'age': 22}]}, {'name': 'My Aunt', 'children': [{'name': 'Cousin 1', 'age': 34}, {'name': 'Cousin 2', 'age': 36, 'children': [{'name': 'Cousin 2 Jr.', 'age': 2}]}]}]}
However, you should never use eval. The next slide demonstrates why.

eval gone wrong

Observe what happens when we use eval on a string representation of a JSON object:
f_other = open(os.path.join('data', 'evil_family.json'))
eval(f_other.read())
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [34], in <cell line: 2>() 1 f_other = open(os.path.join('data', 'evil_family.json')) ----> 2 eval(f_other.read()) File <string>:6, in <module> File ~/Desktop/TA/dsc80/lectures/sp23/lec14/util.py:173, in err() 172 def err(): --> 173 raise ValueError('i just deleted all your files lol 😂') ValueError: i just deleted all your files lol 😂
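As an aside (not covered in lecture): if you ever need to evaluate a string that should contain only a Python literal, the standard library's ast.literal_eval refuses to run function calls like the one above:

```python
import ast

# A plain literal parses fine.
ast.literal_eval("{'name': 'Grandma', 'age': 94}")

# A function call is rejected instead of executed.
try:
    ast.literal_eval("{'name': util.err(), 'age': 94}")
except ValueError:
    print('refused to evaluate non-literal input')
```

For JSON specifically, json.loads is still the right tool — literal_eval parses Python syntax, not JSON (e.g. it accepts single quotes but not true or null).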
- Since evil_family.json, which could have been downloaded from the internet, contained malicious code, we've now lost all of our files.
- eval evaluates all parts of the input string as if it were Python code.
- Instead, to parse JSON, use the .json() method of a response object, or use the json library.

The json module

Let's process the same file using the json module. Recall:
- json.load(f) loads a JSON file from a file object f.
- json.loads(s) loads a JSON file from a string s.

f_other = open(os.path.join('data', 'evil_family.json'))
s = f_other.read()
s
'{\n "name": "Grandma",\n "age": 94,\n "children": [\n {\n "name": util.err(),\n "age": 60,\n "children": [{"name": "Me", "age": 24}, \n {"name": "Brother", "age": 22}]\n },\n {\n "name": "My Aunt",\n "children": [{"name": "Cousin 1", "age": 34}, \n {"name": "Cousin 2", "age": 36, "children": \n [{"name": "Cousin 2 Jr.", "age": 2}]\n }\n ]\n }\n ]\n}'
json.loads(s)
--------------------------------------------------------------------------- JSONDecodeError Traceback (most recent call last) Input In [36], in <cell line: 1>() ----> 1 json.loads(s) File ~/opt/anaconda3/envs/dsc80/lib/python3.8/json/__init__.py:357, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 352 del kw['encoding'] 354 if (cls is None and object_hook is None and 355 parse_int is None and parse_float is None and 356 parse_constant is None and object_pairs_hook is None and not kw): --> 357 return _default_decoder.decode(s) 358 if cls is None: 359 cls = JSONDecoder File ~/opt/anaconda3/envs/dsc80/lib/python3.8/json/decoder.py:337, in JSONDecoder.decode(self, s, _w) 332 def decode(self, s, _w=WHITESPACE.match): 333 """Return the Python representation of ``s`` (a ``str`` instance 334 containing a JSON document). 335 336 """ --> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 338 end = _w(s, end).end() 339 if end != len(s): File ~/opt/anaconda3/envs/dsc80/lib/python3.8/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx) 353 obj, end = self.scan_once(s, idx) 354 except StopIteration as err: --> 355 raise JSONDecodeError("Expecting value", s, err.value) from None 356 return obj, end JSONDecodeError: Expecting value: line 6 column 17 (char 84)
- Since util.err() is not a string in JSON (there are no quotes around it), json.loads is not able to parse it as a JSON object.
- Never use eval on "raw" data that you didn't create!

Summary

- Use GET HTTP requests to ask for information and POST HTTP requests to send information.
- Use curl in the command-line or the requests Python module to make HTTP requests.
- When responses are JSON, use the .json() method of a response object or the json package to parse them, not eval.