In [1]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm
from sklearn import cluster
import matplotlib.pyplot as plt
%matplotlib inline
from bs4 import BeautifulSoup as bs
import requests
import time
# from ggplot import *
The base URL (endpoint) is:
http://www.airnowapi.org/aq/observation/zipCode/historical/
A `?` after the endpoint marks the start of the query string.
`&` separates the name=value pairs within the query string.
Five name=value pairs are sent with this request.
In [3]:
# Demonstrate building a query URL: name=value pairs joined with '&',
# appended to the endpoint after a '?'.
base_url = "http://www.mywebsite.com/data/api"
attributes = ["key1=value1",
              "key2=value2",
              "API_KEY=39DC3727-09BD-XXXX-XXXX-XXXXXXXXXXXX"
              ]
post_url = '&'.join(attributes)
# Bug fix: the query string must be separated from the path by '?';
# the original print concatenated them with no separator.
print(base_url + '?' + post_url)
In [4]:
# Build the AirNow historical-observation query for one ZIP code on one date.
# NOTE(review): the API key is hardcoded in the notebook — move it to an
# environment variable or secrets store before sharing.
base_url = "http://www.airnowapi.org/aq/observation/zipCode/historical/"
attributes = ["format=application/json",
              "zipCode=20007",
              "date=2017-09-05T00-0000",
              "distance=25",
              "API_KEY=39DC3727-09BD-48C4-BBD8-XXXXXXXXXXXX"
              ]
post_url = '&'.join(attributes)
# Bug fix: insert the '?' that separates the endpoint from the query string.
print(base_url + '?' + post_url)
In [17]:
# Send the GET request. Passing `params` by keyword makes it explicit that
# requests appends '?<post_url>' to the URL (the original passed it as an
# obscure second positional argument and reused one name for two things).
response = requests.get(base_url, params=post_url)
response.raise_for_status()  # fail loudly on HTTP errors instead of inside .json()
ingredients = response.json()
print(ingredients[0])
In [18]:
# Report city, pollutant, and AQI value for every observation returned.
for observation in ingredients:
    pollutant = observation['ParameterName']
    city = observation['ReportingArea']
    aqi_value = observation['AQI']
    print("For Location ", city, " the AQI for ", pollutant, "is ", aqi_value)
In [ ]:
time.sleep(1)
In [7]:
# Load California asthma ED-visit rates by ZIP code.
# Assumes the CSV sits next to the notebook — TODO confirm data provenance/date.
asthma_data = pd.read_csv('asthma-emergency-department-visit-rates-by-zip-code.csv')
asthma_data.head(2)
Out[7]:
In [8]:
# The 'ZIP code' column packs the ZIP and its coordinates separated by a
# newline; split them into two columns and discard the packed original.
split_cols = asthma_data.loc[:, 'ZIP code'].str.split(pat='\n', expand=True)
asthma_data['zip'] = split_cols[0]
asthma_data['coordinates'] = split_cols[1]
asthma_data = asthma_data.drop(columns='ZIP code')
asthma_data.head(2)
Out[8]:
In [9]:
# Long -> wide: one row per (Year, zip, county, ...), one column per age
# group, cell values are the number of asthma ED visits.
index_cols = ['Year', 'zip', 'County', 'coordinates', 'County Fips code']
asthma_unstacked = asthma_data.pivot_table(index=index_cols,
                                           columns='Age Group',
                                           values='Number of Visits')
asthma_unstacked = asthma_unstacked.reset_index(drop=False)
asthma_unstacked.head(2)
Out[9]:
In [ ]:
# Collect one day (2015-09-01) of historical AQI readings for each asthma-data
# ZIP code, building data_dict: {zipcode: {pollutant_name: AQI_value}}.
# NOTE(review): the API key is hardcoded — move it to an env var before sharing.
base_url = "http://www.airnowapi.org/aq/observation/zipCode/historical/"
zips = asthma_unstacked.zip.unique()
zips = zips[:450]  # cap the number of API calls (quota/rate limit)
date ="date=2015-09-01T00-0000"
api_key = "API_KEY=39DC3727-09BD-48C4-BBD8-XXXXXXXXXXXX"
return_format = "format=application/json"
zip_str = "zipCode="  # left open-ended; each ZIP is appended per request below
post_url = "&".join([date,api_key,return_format,zip_str])
data_dict = {}
for zipcode in zips:
    time.sleep(1)  # throttle so we respect the API's rate limit
    zip_post = post_url + str(zipcode)
    # second positional argument of requests.get is `params` (the query string)
    ingredients = requests.get(base_url, zip_post)
    ingredients = ingredients.json()
    zip_data = {}
    # one reading per pollutant (e.g. OZONE, PM2.5) for this ZIP
    for data_point in ingredients:
        AQIType = data_point['ParameterName']
        AQIVal = data_point['AQI']
        zip_data[AQIType] = AQIVal
    data_dict[zipcode]= zip_data
In [24]:
# Fetch the Wikipedia "Data science" article and parse it.
# Improvement: name the parser explicitly — otherwise BeautifulSoup picks
# whichever parser happens to be installed (and emits a warning), making
# parsing behavior machine-dependent.
ingredients = requests.get("https://en.wikipedia.org/wiki/Data_science")
soup = bs(ingredients.text, 'html.parser')
print(soup.body.p)
In [15]:
# Grab the article body and show one paragraph: raw HTML first, then text.
parser_div = soup.find("div", class_="mw-parser-output")
wiki_content = parser_div.find_all('p')
second_paragraph = wiki_content[1]
print(second_paragraph)
print('*****************************************')
print(second_paragraph.text)
In [28]:
# Print the table of contents: the nested <ul> lists inside the #toc div.
parser_div = soup.find("div", id="toc")
wiki_content = parser_div.find_all('ul')
for toc_list in wiki_content:
    print(toc_list.text)
In [56]:
# Collect every link inside the "History" section: links that appear between
# the section-1 edit anchor and the section-2 edit anchor.
# Bug fix: the original comparison strings contained '§ion=' — a mangled
# '&section=' (the '&sect;' HTML entity got rendered as '§') — so neither
# marker could ever match and the state machine never toggled.
wiki_content = soup.find_all('a', href=True)
in_hist = False
links = []
for anchor in wiki_content:
    link = anchor['href']
    if link == '/w/index.php?title=Data_science&action=edit&section=2':
        in_hist = False  # reached the next section's edit link: stop collecting
    if in_hist:
        links.append(link)
    if link == "/w/index.php?title=Data_science&action=edit&section=1":
        in_hist = True  # entered the History section: start collecting
print(links)
In [20]:
# Scrape the first substantive paragraph of several related Wikipedia articles.
topics = ['Data_scraping','Machine_learning','Statistics','Linear_algebra',
          'Cluster_analysis','Scientific_modelling','Analysis','Linear_regression']
base_url = 'https://en.wikipedia.org/wiki/{}'
paragraphs = []
for topic in topics:
    url = base_url.format(topic)
    # Bug fix: the original hardcoded the Data_science URL here, ignoring
    # `url`, so every topic scraped the same page.
    ingredients = requests.get(url)
    soup = bs(ingredients.text)
    parser_div = soup.find("div", class_="mw-parser-output")
    wiki_content = parser_div.find_all('p')
    # Keep the first non-trivial paragraph among the first ten.
    # NOTE(review): if none of the first 10 paragraphs qualifies, nothing is
    # appended and the zip below silently misaligns topics and paragraphs.
    for p in range(10):
        if len(wiki_content[p].text) > 10:
            paragraphs.append(wiki_content[p].text)
            break
    time.sleep(1)  # be polite to Wikipedia between requests
print(dict(zip(topics, paragraphs)))
In [61]:
pickle.dump(data_dict,open('AQI_data_raw.p','wb'))
In [63]:
# Keep only the 2015 asthma rows for ZIP codes we actually have AQI data for.
collected = list(data_dict.keys())
has_aqi = asthma_unstacked.zip.isin(collected)
is_2015 = asthma_unstacked.Year == 2015
asthma_2015_sub = asthma_unstacked.loc[has_aqi & is_2015, :]
In [67]:
# Rows = ZIP codes, columns = pollutant types; promote the ZIP from the
# index into an ordinary column named 'zip' for the merge that follows.
aqi_data = (pd.DataFrame
              .from_dict(data_dict, orient='index')
              .reset_index(drop=False)
              .rename(columns={'index': 'zip'}))
aqi_data.head()
Out[67]:
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html
In [156]:
# Join asthma visits with AQI readings on ZIP and shorten the column names.
asthma_aqi = asthma_2015_sub.merge(aqi_data, how='outer', on='zip')
short_names = {'Adults (18+)': 'Adults',
               'All Ages': 'Incidents',
               'Children (0-17)': 'Children'}
asthma_aqi = asthma_aqi.rename(columns=short_names)
asthma_aqi.head(2)
Out[156]:
In [74]:
asthma_aqi.Incidents.plot.hist(20)
Out[74]:
In [75]:
asthma_aqi.loc[:,['Incidents','OZONE']].plot.density()
Out[75]:
In [76]:
asthma_aqi.loc[:,['PM2.5','PM10']].plot.hist()
Out[76]:
In [327]:
asthma_aqi.plot.scatter('OZONE','PM2.5')
Out[327]:
In [77]:
# OLS: asthma ED visits (all ages) ~ ozone + fine particulates + intercept.
y = asthma_aqi.loc[:, 'Incidents']
# Fix: .copy() so adding the constant column writes to an independent frame
# instead of (possibly) a view of asthma_aqi (SettingWithCopyWarning).
x = asthma_aqi.loc[:, ['OZONE', 'PM2.5']].copy()
x['c'] = 1  # explicit intercept term
ols_model1 = sm.OLS(y, x, missing='drop')
results = ols_model1.fit()
print(results.summary())
# Fix: context manager closes the pickle file (original leaked the handle).
with open('ols_model_results.p', 'wb') as fh:
    pickle.dump([results, ols_model1], fh)
In [85]:
# Partial-regression plots: each predictor's marginal relationship with the
# response after controlling for the other regressors.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(results, fig=fig)
In [137]:
# Scrape the CA ZIP-code population table, then lift the table's first row
# into the column header (read_html left it as data row 0).
resp = requests.get('https://www.california-demographics.com/zip_codes_by_population')
soup = bs(resp.text)
table = soup.find("table")
population = pd.read_html(str(table), flavor='html5lib')[0]
population = population.rename(columns=population.iloc[0]).drop(index=0)
population.head(2)
Out[137]:
In [138]:
# Some table rows cover two ZIP codes ("12345 and 67890"): split them, halve
# the population for such rows (assumes an even split — TODO confirm), and
# emit one row per ZIP.
population[['zip','zip2']] = population.loc[:, 'Zip Code'].str.split(
                                                pat=' and ',
                                                expand=True)
# Fix: np.float was removed in NumPy 1.24+; the builtin float is equivalent.
population.Population = population.Population.astype(float)
# Fix: `series != None` is an unreliable element-wise comparison; use notna()
# to detect the rows that actually list a second ZIP.
two_zip = population.zip2.notna()
population.loc[two_zip, 'Population'] = population.loc[two_zip, 'Population'] / 2
# Fix: duplicate rows must come from rows that HAVE a second ZIP; the original
# filtered on `zip` (never null), producing spurious NaN-zip rows in the concat.
temp_pop = population.loc[two_zip, ['Population', 'zip2']].copy()
temp_pop.rename(columns={'zip2': 'zip'}, inplace=True)
population = pd.concat([population.loc[:, ['Population', 'zip']],
                        temp_pop], axis=0)
population.head(2)
Out[138]:
In [157]:
# Merge in population and fit Adults ~ ozone + population, as both a Poisson
# GLM (count outcome) and an OLS model for comparison.
asthma_aqi = asthma_aqi.merge(population, how='left', on='zip')
y = asthma_aqi.loc[:, 'Adults']
# Fix: .copy() so adding the intercept column doesn't write into a view.
x = asthma_aqi.loc[:, ['OZONE', 'Population']].copy()
x['c'] = 1  # explicit intercept term
glm_model = sm.GLM(y, x, missing='drop', family=sm.families.Poisson())
ols_model2 = sm.OLS(y, x, missing='drop')
glm_results = glm_model.fit()
results = ols_model2.fit()
print(glm_results.summary())
# Fix: context manager closes the pickle file (original leaked the handle).
with open('glm_model_pop_results.p', 'wb') as fh:
    pickle.dump([glm_results, glm_model], fh)
In [158]:
# Partial-regression plots for the population OLS model.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(results, fig=fig)
In [159]:
# Influence plot: flags high-leverage observations by Cook's distance.
fig, ax = plt.subplots(figsize=(12,8))
fig = sm.graphics.influence_plot(results, ax=ax, criterion="cooks")
In [161]:
# Four-panel regression diagnostics (fit, residuals, partial regression,
# CCPR) for the OZONE regressor.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results, "OZONE", fig=fig)
In [325]:
# Build the clustering feature matrix: drop incomplete rows, then
# mean-normalize each column so no single scale dominates the distances.
feature_cols = ['OZONE', 'PM2.5', 'Incidents']
model_df = asthma_aqi.loc[:, feature_cols].dropna(axis=0)
model_df = (model_df - model_df.mean()) / (model_df.max() - model_df.min())
In [ ]:
# K-means with 3 clusters over the normalized features.
# Fix: pin random_state so the (stochastic) initialization — and therefore
# the cluster labels — are reproducible across Restart & Run All.
asthma_air_clusters = cluster.KMeans(n_clusters=3, random_state=42)
asthma_air_clusters.fit(model_df)
model_df['clusters3'] = asthma_air_clusters.labels_
In [326]:
# 3-D scatter of the clustered data, colored by cluster label.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 — registers the '3d' projection
fig = plt.figure(figsize=(4, 3))
# Fix: constructing Axes3D(fig, ...) directly (without adding it to the
# figure) is deprecated in modern matplotlib; add the axes via the figure
# and set the viewing angle explicitly.
ax = fig.add_axes([0, 0, .95, 1], projection='3d')
ax.view_init(elev=48, azim=134)
labels = asthma_air_clusters.labels_
# Fix: np.float was removed in NumPy 1.24+; the builtin float is equivalent.
ax.scatter(model_df.loc[:, 'PM2.5'], model_df.loc[:, 'OZONE'], model_df.loc[:, 'Incidents'],
           c=labels.astype(float), edgecolor='k')
ax.set_xlabel('Particulates')
ax.set_ylabel('Ozone')
ax.set_zlabel('Incidents')