In [ ]:
import geoplot as gplt
import gdelt
import re
import numpy as np
import pandas as pd
import datetime
from tzwhere import tzwhere
import pytz
tz1 = tzwhere.tzwhere(forceTZ=True)
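In [ ]:
# Sanity check on the tzwhere lookup; the coordinates below are illustrative
# values for Marawi City, not rows pulled from the data.
tz1.tzNameAt(8.0, 124.29, forceTZ=True)  # expect 'Asia/Manila'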
In [ ]:
import pickle
# load previously pulled GDELT events for the Marawi siege (path is local to the author's machine)
marawi = pickle.load(open('/Users/linwood/Documents/marawievents.pkl','rb'))
In [ ]:
gd = gdelt.gdelt()
# %time marawi = gd.Search(['2017 May 23'],normcols=True)
In [ ]:
def striptimen(x):
    """Convert a GDELT integer date (YYYYMMDDHHMMSS) to numpy.datetime64."""
    date = str(int(x))
    n = np.datetime64("{}-{}-{}T{}:{}:{}".format(
        date[:4], date[4:6], date[6:8], date[8:10], date[10:12], date[12:]))
    return n

def timeget(x):
    """Convert a (lat, lon, timestamp) triple to a timezone-aware local datetime."""
    try:
        # tag the naive timestamp as UTC
        now_aware = pytz.utc.localize(x[2].to_pydatetime())
        # get the timezone string representation using the lat/lon pair
        timezone_str = tz1.tzNameAt(x[0], x[1], forceTZ=True)
        # get the time offset
        timezone = pytz.timezone(timezone_str)
        # convert UTC to calculated local time
        return now_aware.astimezone(timezone)
    except Exception:
        # missing coordinates or an unresolvable timezone; leave the value empty
        return np.nan

# vectorize our two functions
vect = np.vectorize(striptimen)
vect2 = np.vectorize(timeget)
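In [ ]:
# Quick check of the helpers above with illustrative inputs (not rows from the data).
# GDELT `dateadded` integers are formatted YYYYMMDDHHMMSS:
striptimen(20170523043000)   # -> numpy.datetime64('2017-05-23T04:30:00')
# timeget expects a (lat, lon, pandas.Timestamp) triple:
timeget([8.0, 124.29, pd.Timestamp('2017-05-23 04:30:00')])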
In [ ]:
dates = vect(marawi.dateadded.values)
marawi = marawi.assign(dates=dates)
marawi.set_index(dates,inplace=True)
In [ ]:
# build timezone-aware local times; timeget returns NaN for rows with missing coordinates
datetz = [timeget(l) for l in marawi[['actiongeolat', 'actiongeolong', 'dates']].values.tolist()]
marawi = marawi.assign(datezone=datetz)
In [ ]:
maute2= marawi[(marawi.actiongeofeatureid=='-2438515') \
& (marawi.eventrootcode=='19')]
maute2.sort_values('datezone')[['datezone','sourceurl']].drop_duplicates('sourceurl').head()
In [ ]:
# Alternative to strip all known website domains and add to regex
# endings = pd.read_html('https://iwantmyname.com/domains/domain-name-registration-list-of-extensions')[0]
# endings.columns = ['Domain extension','USD per year','Description']
# endings = endings.assign(doms=endings['Domain extension'].apply(lambda x: ("\\"+ x.split(' ')[0])))
# endingslist = endings['doms'].values.tolist()
# endingslist.append('\.IE')
# endingsString = "|".join(endingslist)
In [ ]:
# known domain regex
# r = re.compile('()({})'.format(endingsString),flags = re.IGNORECASE)
# lazy meta-character regex; more elegant
s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
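In [ ]:
# What the pattern captures, using a url that appears later in this notebook:
s.search('http://www.philstar.com/nation/2017/05/23/1702882/marawi-residents-told-stay-home-firefights-continue').group()
# -> 'http://www.philstar.com'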
In [ ]:
frame = maute2.drop_duplicates(['sourceurl'])
frame = frame.assign(provider=frame.sourceurl.apply(
    lambda x: s.search(x).group() if s.search(x) else np.nan))
groups = frame.groupby(['provider']).size().sort_values(ascending=False).reset_index()
groups.columns = ['provider','count']
groups
In [ ]:
frame2 = frame[frame.provider.notnull()].drop_duplicates('sourceurl')[['provider', 'sourceurl', 'dates']].copy()
In [ ]:
# convert to POSIX timestamps so the publish times can be averaged
frame2 = frame2.assign(dates=frame2['dates'].apply(lambda x: x.to_pydatetime().timestamp()))
In [ ]:
# keep only providers with at least three stories
grp = frame2.groupby('provider').filter(lambda x: len(x) >= 3).groupby('provider')
In [ ]:
final = grp.agg([np.mean, np.max, np.min])
newfinal = pd.DataFrame(final['dates']['mean']
                        .apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
                        .sort_values(ascending=True)).reset_index().set_index('mean', drop=False)
newfinal = newfinal.tz_localize('UTC')
newfinal = newfinal.tz_convert('Asia/Manila')
newfinal.columns = ['provider', 'UTC Time']
newfinal.index.name = 'Philippines Time'
# print(newfinal.to_html())
In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib as mpl
# assumes a custom 'economist' matplotlib stylesheet is installed locally
mpl.style.use('economist')
timeseries = pd.concat([marawi.resample('H')['sourceurl'].count(),
                        maute2.resample('H')['sourceurl'].count()], axis=1)
timeseries.fillna(0, inplace=True)
timeseries.columns = ['Total Events', 'Marawi Violent Events Only']
timeseries = timeseries.assign(
    Normalized=(timeseries['Marawi Violent Events Only'] / timeseries['Total Events']) * 100)
f, ax = plt.subplots(figsize=(13, 7))
ax = timeseries.Normalized\
    .ewm(adjust=True, ignore_na=True, min_periods=5, span=12).mean()\
    .plot(color="#C10534", label='Exponentially Weighted Count', linewidth=4.8)
ax.set_title('Hourly Count of Violent Events in Marawi',fontsize=28)
for label in ax.get_xticklabels():
    label.set_fontsize(16)
ax.set_xlabel('Hour of the Day', fontsize=20)
ax.set_ylabel('Percentage of Hourly Total', fontsize=15)
ax.legend()
plt.tight_layout()
plt.savefig('../assets/img/countGraphic.png')
# pd.date_range(start='2017 23 May 00:00:00', end='2017 23 May 23:59:59',freq='1H')
In [ ]:
maute2.sourceurl.shape
In [ ]:
marawi.shape
In [ ]:
maute2.drop_duplicates('sourceurl',keep='first').sort_values('dates',ascending=True)['sourceurl'].values.shape
In [ ]:
# Author: Linwood Creekmore
# Email: valinvescap@gmail.com
# Description: Python script to pull content from a website (works on news stories).

###################################
# Standard Library imports
###################################
import re

###################################
# Third party imports
###################################
import requests
from bs4 import BeautifulSoup

# placeholder to store completed urls; acts like a cache
done = {}

def textgetter(url):
    """Scrape web news and return the content.

    Parameters
    ----------
    url : str
        Address of the news report.

    Returns
    -------
    dict
        Key is the url; value is the content or an error message.
        The content is the text of every "p" tag, which usually
        holds the body of the news story.
    """
    global done

    # regex for url check
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')

    # check that it is a url
    if s.search(url):
        # return the cached result if we already scraped this url
        if url in done:
            return {url: done[url]}

        # make the call to the news story
        r = requests.get(url)

        # check for a good response; return a message otherwise
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            return {url: "Unable to reach website."}

        # store bytes of message in variable
        data = r.content

        # parse HTML
        soup = BeautifulSoup(data, 'html.parser')

        # strip paragraphs from HTML and join into a string
        newstext = " ".join([l.text for l in soup.find_all('p')])

        # add to done dictionary to prevent duplication
        done[url] = newstext

        # delete the response; save memory
        del r

        # check if return is longer than the average sentence
        if len(newstext) > 200:
            return {url: newstext}

        # check another place where some sites store the story text
        newstext = " ".join([l.text for l in
                             soup.find_all('div', class_='field-item even')])
        done[url] = newstext

        # check for length; must be longer than a sentence
        if len(newstext) > 200:
            return {url: newstext}

        # if all fails, return a message
        return {url: "No text returned"}
    else:
        # if we don't pass the very first test, it's not a url
        return {url: "This is not a proper url."}
In [ ]:
urls = frame2['sourceurl'].unique()
textgetter(urls[0])
In [ ]:
d=maute2.sourceurl.drop_duplicates().values
In [ ]:
from concurrent.futures import ProcessPoolExecutor

# note: each worker process gets its own copy of the `done` cache
e = ProcessPoolExecutor()
%time results = list(e.map(textgetter,urls))
# %time results2 = np.array(list(e.map(textgetter,d)))
# done = {}
# %time results3 = marawi.sourceurl.drop_duplicates()[1240:1245].apply(textgetter).values
# %time results3 = marawi.sourceurl.drop_duplicates()[1240:1245].apply(textgetter).values
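In [ ]:
# Alternative sketch: scraping is I/O-bound, so threads work too, and they
# share the `done` cache within one process (assumes the same `urls` array):
# from concurrent.futures import ThreadPoolExecutor
# with ThreadPoolExecutor(max_workers=8) as te:
#     results = list(te.map(textgetter, urls))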
In [ ]:
sers = []
for l in results:
    ul = list(l.keys())[0]
    content = l[ul]
    sers.append(pd.Series({'url': ul, 'content': content}))
connie = pd.concat(sers, axis=1).T
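In [ ]:
# Equivalent sketch: flatten the list of single-item dicts in one pass
# connie = pd.DataFrame([{'url': k, 'content': v}
#                        for res in results for k, v in res.items()])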
In [ ]:
maute2 = maute2.merge(connie,left_on='sourceurl',right_on='url')
In [ ]:
for l in maute2.drop_duplicates('sourceurl').sort_values('dates', ascending=True).content.values:
    print(l)
In [ ]:
rellie = marawi.sourceurl.drop_duplicates()[20:30].apply(textgetter)
In [ ]:
rellie
In [ ]:
# assumes `d` holds scraped content (e.g. the commented-out `results2` above);
# wrap the array in a Series so pd.concat accepts it
b = pd.concat([pd.Series(d), maute2.sourceurl.drop_duplicates().reset_index(drop=True)], axis=1)
b.columns = ['content', 'url']
b[b.content == "Unable to reach website."]['url'].values
In [ ]:
len(done.keys())
In [ ]:
d
In [ ]:
ur = 'http://www.philstar.com/nation/2017/05/23/1702882/marawi-residents-told-stay-home-firefights-continue'
page = requests.get(ur)
soup = BeautifulSoup(page.content,'html.parser')
In [ ]:
" ".join([l.text for l in soup.find_all('div',class_='field-item even')])
# soup.find_all('p', class_='outer-text')
In [ ]:
maute2[['sourceurl','dateadded','datezone']].drop_duplicates('sourceurl').sort_values('datezone', ascending=True).values
In [ ]:
# print(maute2.sort_values('datezone')[['datezone','sourceurl']].drop_duplicates('sourceurl').head().to_html())
In [ ]:
# `margkg` is assumed to be a GDELT GKG dataframe pulled separately
holder = margkg['date'][margkg['date'].notnull()].index
margkg = margkg.assign(datefix=margkg['date'])
margkg.loc[holder, 'datefix'] = vect(margkg.loc[holder, 'date'].values)
print(margkg['datefix'][margkg['datefix'].isnull()])
margkg['datefix'] = margkg['datefix'].fillna(method='pad')
In [ ]:
import geoplot.crs as gcrs

# `datframe`, `polyframe`, and `samplenum` are assumed to be defined elsewhere
samp = datframe.sample(samplenum)
ax = gplt.polyplot(polyframe, projection=gcrs.PlateCarree(), figsize=(20, 12))
gplt.kdeplot(samp, ax=ax, shade=True, shade_lowest=False, projection=gcrs.TransverseMercator())