In [ ]:
import geoplot as gplt
import gdelt
import re
import numpy as np
import pandas as pd
import datetime
from tzwhere import tzwhere
import pytz
tz1 = tzwhere.tzwhere(forceTZ=True)
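In [ ]:
# Sanity check on the tzwhere lookup; the coordinates below are illustrative
# values for Marawi City, not rows pulled from the data.
tz1.tzNameAt(8.0, 124.29, forceTZ=True)  # expect 'Asia/Manila'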
In [ ]:
import pickle
# load previously pulled GDELT events for the Marawi siege (path is local to the author's machine)
marawi = pickle.load(open('/Users/linwood/Documents/marawievents.pkl','rb'))
In [ ]:
gd = gdelt.gdelt()
# %time marawi = gd.Search(['2017 May 23'],normcols=True)
In [ ]:
def striptimen(x):
    """Convert a GDELT integer date (YYYYMMDDHHMMSS) to numpy.datetime64."""
    date = str(int(x))
    n = np.datetime64("{}-{}-{}T{}:{}:{}".format(
        date[:4], date[4:6], date[6:8], date[8:10], date[10:12], date[12:]))
    return n

def timeget(x):
    """Convert a (lat, lon, timestamp) triple to a timezone-aware local datetime."""
    try:
        # tag the naive timestamp as UTC
        now_aware = pytz.utc.localize(x[2].to_pydatetime())
        # get the timezone string representation using the lat/lon pair
        timezone_str = tz1.tzNameAt(x[0], x[1], forceTZ=True)
        # get the time offset
        timezone = pytz.timezone(timezone_str)
        # convert UTC to calculated local time
        return now_aware.astimezone(timezone)
    except Exception:
        # missing coordinates or an unresolvable timezone; leave the value empty
        return np.nan

# vectorize our two functions
vect = np.vectorize(striptimen)
vect2 = np.vectorize(timeget)
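In [ ]:
# Quick check of the helpers above with illustrative inputs (not rows from the data).
# GDELT `dateadded` integers are formatted YYYYMMDDHHMMSS:
striptimen(20170523043000)   # -> numpy.datetime64('2017-05-23T04:30:00')
# timeget expects a (lat, lon, pandas.Timestamp) triple:
timeget([8.0, 124.29, pd.Timestamp('2017-05-23 04:30:00')])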
In [ ]:
dates = vect(marawi.dateadded.values)
marawi = marawi.assign(dates=dates)
marawi.set_index(dates,inplace=True)
In [ ]:
# build timezone-aware local times; timeget returns NaN for rows with missing coordinates
datetz = [timeget(l) for l in marawi[['actiongeolat', 'actiongeolong', 'dates']].values.tolist()]
marawi = marawi.assign(datezone=datetz)
In [ ]:
maute2= marawi[(marawi.actiongeofeatureid=='-2438515') \
& (marawi.eventrootcode=='19')]
maute2.sort_values('datezone')[['datezone','sourceurl']].drop_duplicates('sourceurl').head()
In [ ]:
# Alternative to strip all known website domains and add to regex
# endings = pd.read_html('https://iwantmyname.com/domains/domain-name-registration-list-of-extensions')[0]
# endings.columns = ['Domain extension','USD per year','Description']
# endings = endings.assign(doms=endings['Domain extension'].apply(lambda x: ("\\"+ x.split(' ')[0])))
# endingslist = endings['doms'].values.tolist()
# endingslist.append('\.IE')
# endingsString = "|".join(endingslist)
In [ ]:
# known domain regex
# r = re.compile('()({})'.format(endingsString),flags = re.IGNORECASE)
# lazy meta-character regex; more elegant
s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
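In [ ]:
# What the pattern captures, using a url that appears later in this notebook:
s.search('http://www.philstar.com/nation/2017/05/23/1702882/marawi-residents-told-stay-home-firefights-continue').group()
# -> 'http://www.philstar.com'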
In [ ]:
frame = maute2.drop_duplicates(['sourceurl'])
frame = frame.assign(provider=frame.sourceurl.apply(
    lambda x: s.search(x).group() if s.search(x) else np.nan))
groups = frame.groupby(['provider']).size().sort_values(ascending=False).reset_index()
groups.columns = ['provider','count']
groups
In [ ]:
frame2 = frame[frame.provider.notnull()].drop_duplicates('sourceurl')[['provider', 'sourceurl', 'dates']].copy()
In [ ]:
# convert to POSIX timestamps so the publish times can be averaged
frame2 = frame2.assign(dates=frame2['dates'].apply(lambda x: x.to_pydatetime().timestamp()))
In [ ]:
# keep only providers with at least three stories
grp = frame2.groupby('provider').filter(lambda x: len(x) >= 3).groupby('provider')
In [ ]:
final = grp.agg([np.mean, np.max, np.min])
newfinal = pd.DataFrame(final['dates']['mean']
                        .apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
                        .sort_values(ascending=True)).reset_index().set_index('mean', drop=False)
newfinal = newfinal.tz_localize('UTC')
newfinal = newfinal.tz_convert('Asia/Manila')
newfinal.columns = ['provider', 'UTC Time']
newfinal.index.name = 'Philippines Time'
# print(newfinal.to_html())
In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib as mpl
# assumes a custom 'economist' matplotlib stylesheet is installed locally
mpl.style.use('economist')
timeseries = pd.concat([marawi.resample('H')['sourceurl'].count(),
                        maute2.resample('H')['sourceurl'].count()], axis=1)
timeseries.fillna(0, inplace=True)
timeseries.columns = ['Total Events', 'Marawi Violent Events Only']
timeseries = timeseries.assign(
    Normalized=(timeseries['Marawi Violent Events Only'] / timeseries['Total Events']) * 100)
f, ax = plt.subplots(figsize=(13, 7))
ax = timeseries.Normalized\
    .ewm(adjust=True, ignore_na=True, min_periods=5, span=12).mean()\
    .plot(color="#C10534", label='Exponentially Weighted Count', linewidth=4.8)
ax.set_title('Hourly Count of Violent Events in Marawi',fontsize=28)
for label in ax.get_xticklabels():
    label.set_fontsize(16)
ax.set_xlabel('Hour of the Day', fontsize=20)
ax.set_ylabel('Percentage of Hourly Total', fontsize=15)
ax.legend()
plt.tight_layout()
plt.savefig('../assets/img/countGraphic.png')
# pd.date_range(start='2017 23 May 00:00:00', end='2017 23 May 23:59:59',freq='1H')
In [ ]:
maute2.sourceurl.shape
In [ ]:
marawi.shape
In [ ]:
maute2.drop_duplicates('sourceurl',keep='first').sort_values('dates',ascending=True)['sourceurl'].values.shape
In [ ]:
# Author: Linwood Creekmore
# Email: valinvescap@gmail.com
# Description: Python script to pull content from a website (works on news stories).

###################################
# Standard Library imports
###################################
import re

###################################
# Third party imports
###################################
import requests
from bs4 import BeautifulSoup

# placeholder to store completed urls; acts like a cache
done = {}

def textgetter(url):
    """Scrape web news and return the content.

    Parameters
    ----------
    url : str
        Address of the news report.

    Returns
    -------
    dict
        Key is the url; value is the content or an error message.
        The content is the text of every "p" tag, which usually
        holds the body of the news story.
    """
    global done

    # regex for url check
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')

    # check that it is a url
    if s.search(url):
        # return the cached result if we already scraped this url
        if url in done:
            return {url: done[url]}

        # make the call to the news story
        r = requests.get(url)

        # check for a good response; return a message otherwise
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            return {url: "Unable to reach website."}

        # store bytes of message in variable
        data = r.content

        # parse HTML
        soup = BeautifulSoup(data, 'html.parser')

        # strip paragraphs from HTML and join into a string
        newstext = " ".join([l.text for l in soup.find_all('p')])

        # add to done dictionary to prevent duplication
        done[url] = newstext

        # delete the response; save memory
        del r

        # check if return is longer than the average sentence
        if len(newstext) > 200:
            return {url: newstext}

        # check another place where some sites store the story text
        newstext = " ".join([l.text for l in
                             soup.find_all('div', class_='field-item even')])
        done[url] = newstext

        # check for length; must be longer than a sentence
        if len(newstext) > 200:
            return {url: newstext}

        # if all fails, return a message
        return {url: "No text returned"}
    else:
        # if we don't pass the very first test, it's not a url
        return {url: "This is not a proper url."}
In [ ]:
urls = frame2['sourceurl'].unique()
textgetter(urls[0])
In [ ]:
d=maute2.sourceurl.drop_duplicates().values
In [ ]:
from concurrent.futures import ProcessPoolExecutor

# note: each worker process gets its own copy of the `done` cache
e = ProcessPoolExecutor()
%time results = list(e.map(textgetter,urls))
# %time results2 = np.array(list(e.map(textgetter,d)))
# done = {}
# %time results3 = marawi.sourceurl.drop_duplicates()[1240:1245].apply(textgetter).values
# %time results3 = marawi.sourceurl.drop_duplicates()[1240:1245].apply(textgetter).values
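In [ ]:
# Alternative sketch: scraping is I/O-bound, so threads work too, and they
# share the `done` cache within one process (assumes the same `urls` array):
# from concurrent.futures import ThreadPoolExecutor
# with ThreadPoolExecutor(max_workers=8) as te:
#     results = list(te.map(textgetter, urls))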
In [ ]:
sers = []
for l in results:
    ul = list(l.keys())[0]
    content = l[ul]
    sers.append(pd.Series({'url': ul, 'content': content}))
connie = pd.concat(sers, axis=1).T
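In [ ]:
# Equivalent sketch: flatten the list of single-item dicts in one pass
# connie = pd.DataFrame([{'url': k, 'content': v}
#                        for res in results for k, v in res.items()])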
In [ ]:
maute2 = maute2.merge(connie,left_on='sourceurl',right_on='url')
In [ ]:
for l in maute2.drop_duplicates('sourceurl').sort_values('dates', ascending=True).content.values:
    print(l)
In [ ]:
rellie = marawi.sourceurl.drop_duplicates()[20:30].apply(textgetter)
In [ ]:
rellie
In [ ]:
# assumes `d` holds scraped content (e.g. the commented-out `results2` above);
# wrap the array in a Series so pd.concat accepts it
b = pd.concat([pd.Series(d), maute2.sourceurl.drop_duplicates().reset_index(drop=True)], axis=1)
b.columns = ['content', 'url']
b[b.content == "Unable to reach website."]['url'].values
In [ ]:
len(done.keys())
In [ ]:
d
In [ ]:
ur = 'http://www.philstar.com/nation/2017/05/23/1702882/marawi-residents-told-stay-home-firefights-continue'
page = requests.get(ur)
soup = BeautifulSoup(page.content,'html.parser')
In [ ]:
" ".join([l.text for l in soup.find_all('div',class_='field-item even')])
# soup.find_all('p', class_='outer-text')
In [ ]:
maute2[['sourceurl','dateadded','datezone']].drop_duplicates('sourceurl').sort_values('datezone', ascending=True).values
In [ ]:
# print(maute2.sort_values('datezone')[['datezone','sourceurl']].drop_duplicates('sourceurl').head().to_html())
In [ ]:
# `margkg` is assumed to be a GDELT GKG dataframe pulled separately
holder = margkg['date'][margkg['date'].notnull()].index
margkg = margkg.assign(datefix=margkg['date'])
margkg.loc[holder, 'datefix'] = vect(margkg.loc[holder, 'date'].values)
print(margkg['datefix'][margkg['datefix'].isnull()])
margkg['datefix'] = margkg['datefix'].fillna(method='pad')
In [ ]:
import geoplot.crs as gcrs

# `datframe`, `polyframe`, and `samplenum` are assumed to be defined elsewhere
samp = datframe.sample(samplenum)
ax = gplt.polyplot(polyframe, projection=gcrs.PlateCarree(), figsize=(20, 12))
gplt.kdeplot(samp, ax=ax, shade=True, shade_lowest=False, projection=gcrs.TransverseMercator())