In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
import requests
from bs4 import BeautifulSoup
In [5]:
# Output column names, in the same order as the <td> cells of each table row.
_UFO_COLUMNS = ['Date/Time', 'City', 'State', 'Shape', 'Duration', 'Summary', 'Posted']


def _parse_report_table(html, parser, source='<unknown>'):
    '''Turn the first <tbody> of one HTML page into a DataFrame (one row per <tr>).'''
    soup = BeautifulSoup(html, parser)
    table = soup.find('tbody')
    if table is None:
        # Without this check the failure would be an opaque AttributeError on None.
        raise ValueError('No <tbody> element found in page: %s' % source)

    n_cols = len(_UFO_COLUMNS)
    records = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < n_cols:
            # Header/spacer rows do not carry a full record; skip instead of
            # raising IndexError.
            continue
        # get_text() returns a plain string even when the cell holds nested
        # markup (where .string would be None), and unlike the NavigableString
        # returned by .string it does not keep the whole parse tree alive.
        # Empty cells are stored as None so they stay missing values.
        records.append([cell.get_text() or None for cell in cells[:n_cols]])
    return pd.DataFrame(records, columns=_UFO_COLUMNS)


def gather_data(urls, parser='html.parser', timeout=30):
    '''
    Downloads each URL and parses the table rows of its <tbody> element.
    Args:
    urls (iterable of str): Pages to download, processed in the given order.
    parser (str): Parser name handed to BeautifulSoup. Naming it explicitly
    keeps results independent of which optional parsers are installed.
    timeout (float): Seconds to wait on each HTTP request before giving up.
    Returns:
    ufo_df (list of DataFrame): One dataframe per URL, in input order, with
    the columns Date/Time, City, State, Shape, Duration, Summary, Posted.
    Raises:
    requests.HTTPError: If a page responds with an HTTP error status.
    ValueError: If a page contains no <tbody> element.
    '''
    ufo_df = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx rather than trying to parse an error page.
        response.raise_for_status()
        ufo_df.append(_parse_report_table(response.text, parser, source=url))
    return ufo_df
In [6]:
# Monthly report-index pages from December 2015 back to January 2014, newest
# first. A generator expression (rather than a list comprehension) keeps the
# loop variables out of the notebook namespace on Python 2.
urls = list(
    'http://www.nuforc.org/webreports/ndxe%d%02d.html' % (year, month)
    for year in (2015, 2014)
    for month in range(12, 0, -1)
)

# One dataframe per monthly page, in the same order as `urls`.
frame_ufo = gather_data(urls)
In [7]:
# Stack the per-page frames into one table. ignore_index=True gives every row a
# unique label instead of restarting the labels at 0 for each page.
df = pd.concat(frame_ufo, ignore_index=True)

# Persist as tab-separated UTF-8 text, without writing the index column.
df.to_csv('data.csv', sep='\t', encoding='utf-8', index=False)

# Function-call form of print is valid on both Python 2 and Python 3.
print(df.shape)
In [ ]:
In [ ]: