Getting Started

Make sure you have installed all of the libraries in the import section below. If you hit an import error, consult that library's documentation for installation help.
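
If any of these are missing, installing from PyPI inside the notebook is usually enough. The line below is only a sketch: the package names are assumed to match the import names (gdeltPyR is published as gdelt, BeautifulSoup as beautifulsoup4), and geoplot may need its cartopy dependencies installed separately.


In [ ]:
# assumed PyPI package names; adjust for your environment
!pip install gdelt geoplot tzwhere pytz requests beautifulsoup4 matplotlib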


In [ ]:
import geoplot as gplt
import gdelt
import re
import numpy as np
import pandas as pd
import datetime
from tzwhere import tzwhere
import pytz

# build the timezone lookup object once; forceTZ lets us match points that
# fall just outside a timezone polygon
tz1 = tzwhere.tzwhere(forceTZ=True)

Setting up gdeltPyR

It's easy to set up gdeltPyR; a single line gets us ready to query. See the GitHub project page for details on accessing other tables and setting other parameters.


In [ ]:
import pickle

# load previously pulled GDELT results from disk (cached to skip the live query below)
marawi = pickle.load(open('/Users/linwood/Documents/marawievents.pkl','rb'))

In [ ]:
gd = gdelt.gdelt()

# uncomment to pull the full day of GDELT events live instead of using the cached pickle
# %time marawi = gd.Search(['2017 May 23'],normcols=True)
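
Other GDELT tables are pulled the same way. The cell below is only a sketch of that pattern; the table='gkg' argument follows the gdeltPyR documentation, and the result is not used anywhere else in this walkthrough.

In [ ]:
# hypothetical example: pull the Global Knowledge Graph for the same day
# gkg = gd.Search(['2017 May 23'], table='gkg')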

In [ ]:
def striptimen(x):
    """Strip time from numpy array or list of dates that are integers"""
    date = str(int(x))
    n = np.datetime64("{}-{}-{}T{}:{}:{}".format(date[:4],date[4:6],date[6:8],date[8:10],date[10:12],date[12:]))
    return n

def timeget(x):
    '''Convert a (lat, lon, timestamp) triple to a timezone-aware local datetime.'''

    try:
        # tag the naive UTC timestamp with the UTC timezone
        now_aware = pytz.utc.localize(x[2].to_pydatetime())

        # get the timezone string representation using the lat/lon pair
        timezone_str = tz1.tzNameAt(x[0], x[1], forceTZ=True)

        # get the time offset
        timezone = pytz.timezone(timezone_str)

        # convert UTC to the calculated local time
        aware = now_aware.astimezone(timezone)
        return aware

    except Exception:
        # missing coordinates or unknown timezone; leave the value empty
        return None

# vectorize our two functions
vect = np.vectorize(striptimen)
vect2=np.vectorize(timeget)
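
As a quick sanity check, both helpers can be exercised on made-up values: a 14-digit dateadded integer and an approximate latitude/longitude for Marawi City. The inputs below are illustrative only.

In [ ]:
# hypothetical inputs: 23 May 2017 12:15:00 UTC, coordinates near Marawi City
print(striptimen(20170523121500))
print(timeget([8.0, 124.29, pd.Timestamp('2017-05-23 12:15:00')]))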

In [ ]:
# convert the integer dateadded column to datetimes and use it as the index
dates = vect(marawi.dateadded.values)
marawi = marawi.assign(dates=dates)
marawi.set_index(dates,inplace=True)

In [ ]:
# compute a timezone-aware local datetime for every event that has coordinates
geo = marawi[['actiongeolat','actiongeolong','dates']]
datetz = [timeget(l) for l in geo[geo.notnull()].values.tolist()]
marawi = marawi.assign(datezone=datetz)

In [ ]:
# keep only events geolocated to Marawi City (GDELT feature id -2438515)
# with CAMEO root code 19 ("Fight"), i.e. violent events
maute2 = marawi[(marawi.actiongeofeatureid=='-2438515') \
               & (marawi.eventrootcode=='19')]

maute2.sort_values('datezone')[['datezone','sourceurl']].drop_duplicates('sourceurl').head()

In [ ]:
# Alternative to strip all known website domains and add to regex

# endings = pd.read_html('https://iwantmyname.com/domains/domain-name-registration-list-of-extensions')[0]
# endings.columns = ['Domain extension','USD per year','Description']
# endings = endings.assign(doms=endings['Domain extension'].apply(lambda x: ("\\"+ x.split(' ')[0])))
# endingslist = endings['doms'].values.tolist()
# endingslist.append('\.IE')
# endingsString = "|".join(endingslist)

In [ ]:
# known domain regex
# r = re.compile('()({})'.format(endingsString),flags = re.IGNORECASE)

# lazy meta-character regex; more elegant
s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
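
A quick check of what the pattern captures, using a made-up URL; group() returns the scheme plus domain, which becomes the provider label.

In [ ]:
# hypothetical URL for illustration only
s.search('http://www.example.com/news/2017/05/23/marawi-clash').group()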

In [ ]:
frame = maute2

frame = frame.drop_duplicates(['sourceurl'])

# extract the scheme plus domain from each source URL to identify the news provider
frame = frame.assign(provider=frame.sourceurl
      .apply(lambda x: s.search(x).group() if s.search(x) else np.nan))

groups = frame.groupby(['provider']).size().sort_values(ascending=False).reset_index()
groups.columns = ['provider','count']
groups

In [ ]:
frame2 = frame.copy()[frame.provider.notnull()].drop_duplicates('sourceurl')[['provider','sourceurl','dates']]

In [ ]:
# convert the datetimes to POSIX timestamps so they can be averaged per provider
frame2 = frame2.assign(dates=frame2['dates'].apply(lambda x: (x.to_pydatetime().timestamp())))

In [ ]:
# keep only providers with at least three stories, then regroup by provider
grp = frame2.groupby('provider').filter(lambda x: len(x)>=3).groupby('provider')

In [ ]:
final = grp.agg([np.mean, np.max, np.min]).sort_values(('dates', 'mean'), ascending=False)
newfinal = pd.DataFrame(final['dates']['mean']
                        .apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
                        .sort_values(ascending=True)).reset_index().set_index('mean', drop=False)
newfinal = newfinal.tz_localize('UTC')
newfinal = newfinal.tz_convert('Asia/Manila')
newfinal.columns = ['provider','UTC Time']
newfinal.index.name='Philippines Time'
# print(newfinal.to_html())

In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib as mpl


# 'economist' is a custom local matplotlib style sheet; substitute any built-in
# style (e.g. 'ggplot') if you don't have it
mpl.style.use('economist')
timeseries= pd.concat([marawi.resample('H')['sourceurl'].count(),maute2.resample('H')['sourceurl'].count()]
         ,axis=1)
timeseries.fillna(0,inplace=True)
timeseries.columns = ['Total Events','Marawi Violent Events Only']
timeseries = timeseries\
.assign(Normalized=(timeseries['Marawi Violent Events Only']/timeseries['Total Events'])*100)
f,ax = plt.subplots(figsize=(13,7))
ax = timeseries.Normalized\
.ewm(adjust=True,ignore_na=True,min_periods=5,span=12).mean()\
.plot(color="#C10534",label='Exponentially Weighted Count', linewidth=4.8)
ax.set_title('Hourly Count of Violent Events in Marawi',fontsize=28)
for label in ax.get_xticklabels():
      label.set_fontsize(16)
ax.set_xlabel('Hour of the Day', fontsize=20)
ax.set_ylabel('Percentage of Hourly Total',fontsize=15)
ax.legend()
plt.tight_layout()
plt.savefig('../assets/img/countGraphic.png')
# pd.date_range(start='2017 23 May 00:00:00', end='2017 23 May 23:59:59',freq='1H')

In [ ]:
maute2.sourceurl.shape

In [ ]:
marawi.shape

In [ ]:
maute2.drop_duplicates('sourceurl',keep='first').sort_values('dates',ascending=True)['sourceurl'].values.shape

In [ ]:
# Author: Linwood Creekmore
# Email: valinvescap@gmail.com
# Description:  Python script to pull content from a website (works on news stories).

###################################
# Standard Library imports
###################################

import re
from io import BytesIO

###################################
# Third party imports
###################################

import requests
import numpy as np
from bs4 import BeautifulSoup


# placeholder to store completed urls; acts like a cache
done = {}
def textgetter(url):
    """Scrapes web news and returns the content
    
    Parameters
    ----------
    
    url : str
        Address to news report
        
    newstext: str
        Returns all text in the "p" tag.  This usually is the content of the news story.
        
    Returns
    ----------
    
    dict: key/value
        Returns dictionary with key = url and value = content/message
    """
    global done
    
    # regex for URL check
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    
    # check that it's a URL
    if s.search(url):
        if url in done:
            # already scraped; return the cached content
            return {url: done[url]}
        else:
            
            # make the request to the news story
            r = requests.get(url)
            # check for a good response; return message otherwise
            if r.status_code != 200:
                done[url]="Unable to reach website."
                return {url:"Unable to reach website."}
            # store bytes of message in variable
            data = r.content
            # parse HTML 
            soup = BeautifulSoup(data,'html.parser')
            # strip paragraphs from HTML and join into a string
            newstext = " ".join([l.text for l in soup.find_all('p')])
            # add to done dictionary to prevent duplication
            done[url]=newstext
            # delete the response; save memory
            del r
            # check if return is longer than average sentence
            if len(newstext)>200:
                return {url:newstext}
            else:
                # check for another place where text is stored
                newstext = " ".join([l.text for l in soup.find_all('div',class_='field-item even')])
                done[url]=newstext
                # check for length; must be longer than a sentence
                if len(newstext)>200:
                    return {url:newstext}
                else:
                    # if all fails, return message
                    return {url: "No text returned"}
    else:
        # if we don't pass very first test; not a url
        return {url:"This is not a proper url."}

In [ ]:
urls = frame2['sourceurl'].unique()
textgetter(urls[0])

In [ ]:
d=maute2.sourceurl.drop_duplicates().values

In [ ]:
from concurrent.futures import ProcessPoolExecutor

e = ProcessPoolExecutor()
%time results = list(e.map(textgetter,urls))
# %time results2 = np.array(list(e.map(textgetter,d)))
# done = {}
# %time results3 = marawi.sourceurl.drop_duplicates()[1240:1245].apply(textgetter).values

In [ ]:
# unpack each {url: content} result into a two-column frame of url and content
sers = []

for l in results:
    ul = list(l.keys())[0]
    content = l[ul]
    sers.append(pd.Series({'url':ul,'content':content}))
connie = pd.concat(sers,axis=1).T

In [ ]:
maute2 = maute2.merge(connie,left_on='sourceurl',right_on='url')

In [ ]:
for l in maute2.drop_duplicates('sourceurl').sort_values('dates',ascending=True).content.values:
    print(l)

In [ ]:
rellie = marawi.sourceurl.drop_duplicates()[20:30].apply(textgetter)

In [ ]:
rellie

In [ ]:
# check which sites could not be reached (assumes d holds the scraped content,
# aligned positionally with the deduplicated source URLs)
b = pd.concat([pd.Series(d), maute2.sourceurl.drop_duplicates().reset_index(drop=True)], axis=1)
b.columns= ['content','url']
b[b.content=="Unable to reach website."]['url'].values

In [ ]:
len(done.keys())

In [ ]:
d

In [ ]:
ur = 'http://www.philstar.com/nation/2017/05/23/1702882/marawi-residents-told-stay-home-firefights-continue'
page = requests.get(ur)
soup = BeautifulSoup(page.content,'html.parser')

In [ ]:
" ".join([l.text for l in soup.find_all('div',class_='field-item even')])

In [ ]:
mautesub[['sourceurl','dateadded','datezone']].drop_duplicates('sourceurl').sort_values('datezone',ascending=True).values

In [ ]:
# print(maute2.sort_values('datezone')[['datezone','sourceurl']].drop_duplicates('sourceurl').head().to_html())

In [ ]:
# fill in the datefix column: convert non-null GKG date integers to datetimes,
# then forward-fill any remaining gaps
holder = margkg['date'][margkg['date'].notnull()].index
margkg = margkg.assign(datefix=margkg['date'])
margkg.loc[holder,'datefix'] = vect(margkg.loc[holder,'date'].values)
print(margkg['datefix'][margkg['datefix'].isnull()])
margkg['datefix'] = margkg['datefix'].fillna(method='pad')

In [ ]:
import geoplot.crs as gcrs

samp = datframe.sample(samplenum)
ax = gplt.polyplot(polyframe,projection=gcrs.PlateCarree(),figsize=(20,12))
gplt.kdeplot(samp,ax=ax,shade=True,shade_lowest=False,projection=gcrs.TransverseMercator())