GDELT 1.0 Code (skip)

This will need to be integrated into GDELT 2.0; the headers are different. GDELT 1.0 goes back to 1979, while 2.0 only goes back to February 2015.


In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
Image(url='../utils/images/spinningglobe.gif')


Out[1]:

In [2]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'
gdelt_gkg_url = 'http://api.gdeltproject.org/api/v1/gkg_geojson'
# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html')
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href")

# separate out those links that begin with four digits 
file_list = [x for x in link_list if str.isdigit(x[0:4])]
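
As a quick check, a GDELT 1.0 download URL can be assembled from any entry in file_list. A minimal sketch, assuming the entries are zip names like '20130401.export.CSV.zip' (the four leading digits are what the isdigit filter above keys on):

In [ ]:
# Sketch: build a full GDELT 1.0 download URL from a file_list entry.
if file_list:
    sample = file_list[0]
    print(gdelt_base_url + sample)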

In [83]:
# pd.read_excel('./CSV.header.fieldids.xlsx').columns.tolist()

In [3]:
masterListUrl = 'http://data.gdeltproject.org/gdeltv2/masterfilelist.txt'
directory = requests.get(masterListUrl)
results = directory.text.split('\n')  # use .text; .content is bytes in Python 3 and cannot be split on a str

In [85]:
results;

In [86]:
# import pandas as pd

# pd.options.display.max_rows = 200
# df = pd.DataFrame(data.json())

# df['coords'] = df.features.apply(lambda row: row['geometry']['coordinates'])
# df['lat'] = df.features.apply(lambda row: row['geometry']['coordinates'][1])
# df['lon'] = df.features.apply(lambda row: row['geometry']['coordinates'][0])
# df['name'] = df.features.apply(lambda row: row['properties']['name'])
# df['pubdate'] = df.features.apply(lambda row: row['properties']['urlpubtimedate'])
# df['urltone'] = df.features.apply(lambda row: row['properties']['urltone'])
# df['mentionedNames'] = df.features.apply(lambda row: row['properties']['mentionednames'])
# df['mentioinedThemes'] = df.features.apply(lambda row: row['properties']['mentionedthemes'])
# df['url'] = df.features.apply(lambda row: row['properties']['url'])

GDELT 2.0 Access


In [87]:
import requests
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse

Logic for GDELT module

Enter a date or date range. GDELT 2.0 only goes back to Feb 18 2015; GDELT 1.0 goes back to 1979.

Convert the entered date or date range to a string, then search for that string in the master file list dataframe. Use the tblType parameter to pull the correct table(s).

  • default is to take the current time and pull the most recent file (see the rounding sketch after this list)
  • a historical date defaults to the last record of that day
    • parse the date
    • add a feature to enter a time for a historical date and pull the closest 15-minute file
    • a date range pulls the last file for each day and concatenates the results into a single dataframe

Choose a database

  • select between events, event mentions, or gkg

Return it as a Python or R dataframe

  • use the feather library for Python
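
The default case ("take current time and most recent file") reduces to rounding the current time down to the previous 15-minute boundary, since GDELT 2.0 publishes on that cadence. A minimal sketch of that rounding (the time_change logic near the end of this notebook does the same thing):

In [ ]:
import datetime

# Round the current time down to the most recent 15-minute GDELT 2.0 file.
now = datetime.datetime.now()
latest = now.replace(minute=15 * (now.minute // 15), second=0, microsecond=0)
print(latest.strftime('%Y%m%d%H%M%S'))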


In [88]:
from dateutil.parser import parse
import traceback,sys
import pandas as pd
import numpy as np
import datetime
import requests


class gdeltSearch(object):
    """Placeholder string"""
    
    def __init__(self,
                 gdelt2MasterUrl = 'http://data.gdeltproject.org/gdeltv2/masterfilelist.txt',
                 gdelt1MasterUrl = 'http://data.gdeltproject.org/events/index.html',
                 tblType = None,
                 headers = None,
                 masterdf = None,
                 clean = None,
                 queryTime = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
                 ):
        self.gdelt2MasterUrl=gdelt2MasterUrl
        self.gdelt1MasterUrl=gdelt1MasterUrl
        self.clean = map(
                    lambda x: x.split(' '),
                    requests.get(self.gdelt2MasterUrl).content.split('\n')
                        )
        del self.clean[-1]
        self.masterdf = pd.DataFrame(self.clean)
        self.masterdf.fillna('', inplace=True)
        self.queryTime=queryTime

In [89]:
import traceback,sys
import datetime
from dateutil.parser import parse
import numpy as np
import requests
import pandas as pd


class gdeltSearch(object):
    """Placeholder string"""
    
    def __init__(self,
                 gdelt2MasterUrl='http://data.gdeltproject.org/gdeltv2/masterfilelist.txt',
                 gdelt1MasterUrl='http://data.gdeltproject.org/events/index.html',
                 tblType=None,
                 headers=None,
                 masterdf=None,
                 clean = None,
                 ):
        self.gdelt2MasterUrl=gdelt2MasterUrl
        self.gdelt1MasterUrl=gdelt1MasterUrl
        self.clean = map(lambda x: x.split(' '),requests.get(self.gdelt2MasterUrl).content.split('\n'))
        del self.clean[-1]
        self.masterdf = pd.DataFrame(self.clean)
        self.masterdf.fillna('',inplace=True)
        
        
        
    @staticmethod
    def dateInputCheck(date):
        """Function to check date entered by user.

        Example

        Parameters
            ----------
            date : {string or list}, 
                Input data, where ``date`` is a single date string, 
                two dates representing a range, or several dates \
                that represent individual days of interest.
        Returns
        -------
        date : {string or list}
            The validated date input.
        """

        if isinstance(date,str):
            if date != "":
                if parse(date) > datetime.datetime.now():
                    raise ValueError('Your date is greater than the current date.\
                    Please enter a relevant date.')
                elif parse(date)<parse('Feb 18 2015'):
                    raise ValueError('GDELT 2.0 only supports \'Feb 18 2015 - Present\'\
                    queries currently. Try another date.')

        elif isinstance(date,list):
            if len(date)==1:
                try:
                    parsed = parse("".join(date))
                except:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                    traceback.print_exception(exc_type, exc_value, exc_traceback,
                                              limit=2, file=sys.stdout)
                    raise ValueError("One or more of your input date strings does \
                    not parse to a date format. Check input.")
                # compare outside the try block so these errors are not
                # masked by the parse-failure message above
                if parsed > datetime.datetime.now():
                    raise ValueError('Your date is greater than the current \
                    date. Please enter a relevant date.')
                elif parsed < parse('Feb 18 2015'):
                    raise ValueError('GDELT 2.0 only supports \'Feb 18 2015 - Present\' \
                    queries currently. Try another date.')


            elif len(date)==2:
                try:
                    map(parse,date)
                except Exception as exc:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                    traceback.print_exception(exc_type, exc_value, exc_traceback,
                                              limit=2, file=sys.stdout)
                    raise ValueError("One or more of your input date strings \
                    does not parse to a date format. Check input.")

                if not parse(date[0]) < parse(date[1]):
                    raise ValueError('Start date greater than end date. Check date \
                    strings.')

                if np.all(
                        np.logical_not(np.array(map(parse,date))> datetime.datetime.now())
                        ) == False:
                    raise ValueError("One of your dates is greater than the current \
                    date. Check input date strings.")


            elif len(date)>2:

                try:
                    map(parse,date)
                except Exception as exc:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                    traceback.print_exception(exc_type, exc_value, exc_traceback,
                                              limit=2, file=sys.stdout)
                    raise ValueError("One or more of your input date strings does \
                    not parse to a date format. Check input.")

                if np.all(
                        np.logical_not(np.array(map(parse,date))> datetime.datetime.now())
                        ) == False:
                    raise ValueError("One or more of your input date strings does not \
                    parse to a date format. Check input.")
                    
        return date  # a staticmethod cannot set attributes on self
            
    @staticmethod        
    def parse_date(var):
        """Return datetime object from string."""

        try:
            return np.where(isinstance(parse(var),datetime.datetime),
                     parse(var),"Error")             
        except:
            return "You entered an incorrect date.  Check your date format."

    @staticmethod
    def dateformatter(datearray):
        """Function to format strings for numpy arange"""
        return parse(datearray).strftime("%Y-%m-%d")

    @staticmethod
    def dateRanger(originalArray):
        """Function to vectorize date formatting function.
        Creates datetime.date objects for each day in the range
        and stores in a numpy array.

        Example

        Parameters
            ----------
            originalArray : {array-like}, List of date strings \
            to query.
        Returns
        -------
        dates : numpy array
            Array of datetime objects covering the request.
        """
        if isinstance(originalArray,str):
            """Check user input to retrieve date query."""

            return np.where(len(originalArray)==0,np.array(datetime.datetime.now()),
                     gdeltSearch.parse_date(originalArray))

        elif isinstance(originalArray,list):
            if len(originalArray)==1:
                return np.array(parse("".join(originalArray)))
            elif len(originalArray)>2:
                return np.array(map(parse,originalArray),dtype='datetime64[D]')
            else:
                cleaner = np.vectorize(gdeltSearch.dateformatter)
                converted = cleaner(originalArray).tolist()
                dates = np.arange(converted[0],converted[1],dtype='datetime64[D]')
                dates = np.append(dates,np.datetime64(datetime.date.today())) 
                return dates
    @staticmethod
    def gdeltRangeString(element):
        if element == datetime.date.today():
            multiplier = datetime.datetime.now().minute / 15
            multiple = 15 * multiplier
            converted = datetime.datetime.now().replace(minute=multiple,second=0)
        else:
            converted = (datetime.datetime.combine(element,datetime.time.min) + 
                datetime.timedelta(
                                    minutes=45,hours=23
                                    )
                                   )
        return converted.strftime('%Y%m%d%H%M%S')


    @staticmethod
    def vectorizer(function,dateArray):
        helper = np.vectorize(function)
        return helper(dateArray.tolist()).tolist()

    # Finds the urls from an array of dates

    @staticmethod
    def UrlFinder(targetDate):
        # relies on the module-level masterdf built further down
        return masterdf[masterdf[2].str.contains(targetDate)]

    @staticmethod
    def vectorizedUrlFinder(function,urlList):
        helper=np.vectorize(function)
        return pd.concat(helper(urlList).tolist())

    @staticmethod
    def downloadVectorizer(function,urlList):
        '''
        test2 = downloadVectorizer(downloadAndExtract,b)
        test2.columns=gkgHeaders.tableId.tolist()
        '''
        helper=np.vectorize(function)
        return pd.concat(helper(urlList).tolist())

In [90]:
gdelt = gdeltSearch()

URLs

The main URLs we need to hit to return data.


In [91]:
masterListUrl = 'http://data.gdeltproject.org/gdeltv2/masterfilelist.txt'
baseUrl = 'http://data.gdeltproject.org/gdeltv2/'

Parameters and Global Variables

This section contains variables that will become self attributes in the classes.


In [ ]:


In [92]:
'''
Listing of all GDELT 15 minute dumps. Code retrieves the list,
splits it on the new line character, and then splits on the space. 
We delete the last entry because it's empty.  
'''
directory = requests.get(masterListUrl)
clean = directory.content.split('\n')
clean = map(lambda x: x.split(' '),clean)
del clean[-1]

"""
Setting up the master list as dataframe for querying
this will be inside the class
"""
masterdf = pd.DataFrame(clean)
masterdf.fillna('',inplace=True)
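
With the master list loaded, finding the files for any 15-minute slot is a substring match on the URL column (column 2), which is what UrlFinder does in the class above. A minimal sketch, with an assumed example datestring:

In [ ]:
# Sketch: the three files (export, mentions, gkg) published for one
# 15-minute slot; '20160926150000' is an assumed example datestring.
slot = masterdf[masterdf[2].str.contains('20160926150000')]
print(slot[2].tolist())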

In [93]:
# # dayFull = downloadVectorizer(downloadAndExtract,masterdf[(masterdf[2].str.contains('/20160926')) & (masterdf[2].str.contains('export'))][2].unique().tolist())
# # dayFull.columns = eventsDbHeaders.tableId.tolist()

# # dayFullMentions = downloadVectorizer(downloadAndExtract,masterdf[(masterdf[2].str.contains('/20160926')) & (masterdf[2].str.contains('mentions'))][2].unique().tolist())
# # dayFullMentions.columns = mentionsHeaders.tableId.tolist()

# # finalFull = dayFull.merge(dayFullMentions,how='outer',on='GLOBALEVENTID')
# # finalFull.reset_index(inplace=True,drop=True)

# finalFull.EventCode>=190

# finalFull[
#     (finalFull.Confidence > 75) & 
#     (finalFull.ActionGeo_Lat.notnull()) & 
#     (finalFull.GoldsteinScale<-5) &
#     (finalFull.ActionGeo_CountryCode=='SY') &
#     (finalFull.EventCode==193)
#         ].SOURCEURL.unique()

# finalFull.columns.tolist()

# finalFull[]

# finalFull[(finalFull.ActionGeo_CountryCode=='SY') & 
#           (finalFull.Actor1Name.notnull()) &
#           (finalFull.Confidence >70) &
#           (finalFull.EventCode>190) &
#           (finalFull.Actor1Name.str.contains('TERRORIST'))].SOURCEURL.unique()

In [94]:
# table type = tblType
import datetime
import pandas as pd

graph = 'gkg'
events = 'events' # includes new GDELT 2.0 mentions table; merged on globaleventid

tblType = events  # default to events db

Date Parameters that will be entered

A testing spot for all the different types of date parameters that can be entered.


In [95]:
defaultDateEntry = "" # string
stringDateEntry = " 2016 09 18" # string
historicalDateEntry = "2015 02 25" #string
errorDate = "What in the heck" # error string
listOfdates = ['Sep 1 2016','2016 09 24'] # list, len 2
moreThanTwo= ['Sept 20 2016','June 3 2011','January 1, 2013'] # list, len greater than 2

date = defaultDateEntry
time = ""

In [96]:
date


Out[96]:
''

Setting the values for the headers

Headers are set based on the tblType value passed in, defaulting to the events DB headers.


In [97]:
gkgHeaders = pd.read_csv(
    '../utils/schema_csvs/GDELT_2.0_gdeltKnowledgeGraph_Column_Labels_Header_Row_Sep2016.tsv',
    delimiter='\t',usecols=['tableId','dataType','Description']
    )
gkgHeaders.tableId.tolist();

eventsDbHeaders = pd.read_csv('../utils/schema_csvs/GDELT_2.0_Events_Column_Labels_Header_Row_Sep2016.csv',
                         delimiter=',',usecols=['tableId','dataType','Description'])
eventsDbHeaders.tableId.tolist();

mentionsHeaders = pd.read_csv('../utils/schema_csvs/GDELT_2.0_eventMentions_Column_Labels_Header_Row_Sep2016.tsv',
                         delimiter='\t',usecols=['tableId','dataType','Description'])
mentionsHeaders.tableId.tolist();
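
Selecting the right header list from tblType is then a simple lookup; a minimal sketch (the add_header function later in this notebook does the same thing from the file URL):

In [ ]:
# Sketch: map a tblType value to its header frame; defaults to the events DB.
headerMap = {'events': eventsDbHeaders, 'mentions': mentionsHeaders,
             'gkg': gkgHeaders}
headers = headerMap.get(tblType, eventsDbHeaders)
print(headers.tableId.tolist()[:5])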

In [98]:
datetime.datetime.now().strftime('%m-%d-%Y')


Out[98]:
'10-23-2016'

Checking Inputs of functions and parameters

We need to see how many dates are passed into the function. Use the logic above.


In [99]:
import traceback,sys
import datetime
from dateutil.parser import parse
import numpy as np

def dateInputCheck(date):
    """Function to check date entered by user.
    
    Example
    
    Parameters
        ----------
        date : {string or list}, 
            Input data, where ``date`` is a single date string, 
            two dates representing a range, or several dates \
            that represent individual days of interest.
    Returns
    -------
    None
        Raises ValueError on invalid input.
    """
    
    if isinstance(date,str):
        if date != "":
            if parse(date) > datetime.datetime.now():
                raise ValueError('Your date is greater than the current date.\
                Please enter a relevant date.')
            elif parse(date)<parse('Feb 18 2015'):
                raise ValueError('GDELT 2.0 only supports \'Feb 18 2015 - Present\'\
                queries currently. Try another date.')

    elif isinstance(date,list):
        if len(date)==1:
            try:
                parsed = parse("".join(date))
            except:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stdout)
                raise ValueError("One or more of your input date strings does \
                not parse to a date format. Check input.")
            # compare outside the try block so these errors are not
            # masked by the parse-failure message above
            if parsed > datetime.datetime.now():
                raise ValueError('Your date is greater than the current \
                date. Please enter a relevant date.')
            elif parsed < parse('Feb 18 2015'):
                raise ValueError('GDELT 2.0 only supports \'Feb 18 2015 - Present\' \
                queries currently. Try another date.')

        
        elif len(date)==2:
            try:
                map(parse,date)
            except Exception as exc:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stdout)
                raise ValueError("One or more of your input date strings \
                does not parse to a date format. Check input.")

            if not parse(date[0]) < parse(date[1]):
                raise ValueError('Start date greater than end date. Check date \
                strings.')
                
            if np.all(
                    np.logical_not(np.array(map(parse,date))> datetime.datetime.now())
                    ) == False:
                raise ValueError("One of your dates is greater than the current \
                date. Check input date strings.")

            
        elif len(date)>2:

            try:
                map(parse,date)
            except Exception as exc:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=2, file=sys.stdout)
                raise ValueError("One or more of your input date strings does \
                not parse to a date format. Check input.")
                
            if np.all(
                    np.logical_not(np.array(map(parse,date))> datetime.datetime.now())
                    ) == False:
                raise ValueError("One or more of your input date strings does not \
                parse to a date format. Check input.")
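
A quick spot-check of the validator against the sample inputs defined earlier; a hedged sketch (errorDate should raise, the others should pass silently):

In [ ]:
# Spot-check dateInputCheck against the sample inputs defined above.
for d in [stringDateEntry, historicalDateEntry, listOfdates]:
    dateInputCheck(d)  # should pass silently
try:
    dateInputCheck(errorDate)
except ValueError as e:
    print('caught: {0}'.format(e))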

In [100]:
np.arange('2016-10-11', '2016-10-16', dtype='datetime64[D]').tolist()


Out[100]:
[datetime.date(2016, 10, 11),
 datetime.date(2016, 10, 12),
 datetime.date(2016, 10, 13),
 datetime.date(2016, 10, 14),
 datetime.date(2016, 10, 15)]

In [101]:
tester = np.append((np.arange('2016-10-11', '2016-10-16', dtype='datetime64[D]')).tolist(),datetime.datetime.now().date().today()).tolist()

In [102]:
tester


Out[102]:
[datetime.date(2016, 10, 11),
 datetime.date(2016, 10, 12),
 datetime.date(2016, 10, 13),
 datetime.date(2016, 10, 14),
 datetime.date(2016, 10, 15),
 datetime.date(2016, 10, 23)]

In [103]:
datetime.datetime.now().date().today() in tester


Out[103]:
True

Checking the tblType input


In [104]:
# gets the urls from array
# resultMaster = vectorizedUrlFinder(UrlFinder,datesToPull)


def tblCheck(tbl):
    '''Checking the input of tblType.'''
    if tbl == 'events' or tbl == '' or tbl == 'mentions':
        resultsUrlList = resultMaster[2][resultMaster[2].str.contains('export|mentions')]
    elif tbl == 'gkg':
        resultsUrlList = resultMaster[2][resultMaster[2].str.contains('gkg')]
    else:
        raise ValueError("Incorrect parameter '{0}' entered. Choose 'events', 'mentions', or 'gkg' for the tblType parameter.".format(tbl))
    return resultsUrlList

Date Functionality (Date ranges)

Use the numpy date-range functionality to create strings of dates between the range endpoints in a list. Then use dateutil to parse those strings into the correct format. Finally, run a query for each date, return the dataframes, and concatenate them into a single one.

  • Logic
    • If the passed-in date list is empty, raise an error
    • If the length is equal to one, find that one date's table or graph
    • If the length is equal to two (see the sketch after this list):
      • if the dates are chronological, convert to a numpy range and pull all tables or graphs, but raise a warning for long ranges
      • if the dates are not chronological, get the individual dates
    • If the length is greater than two, get the individual dates
      • initially, return the latest time
      • add an option to return the closest 15-minute interval to a passed-in time
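
A minimal sketch of the two-date branch described above (a chronological pair becomes a numpy day range, otherwise the dates are treated individually); note that numpy's arange is not endpoint inclusive, which is why dateRanger below appends the end date:

In [ ]:
from dateutil.parser import parse
import numpy as np

def rangeOrIndividual(datePair):
    """Sketch: chronological pair -> numpy day range; otherwise individual dates."""
    d0, d1 = parse(datePair[0]), parse(datePair[1])
    if d0 < d1:
        return np.arange(d0.strftime('%Y-%m-%d'), d1.strftime('%Y-%m-%d'),
                         dtype='datetime64[D]')
    return np.array([d0, d1], dtype='datetime64[D]')

print(rangeOrIndividual(['2016 09 01', '2016 09 05']))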

Code Pieces and Functions


In [105]:
# numpy example of ranging the date
np.arange('2016-08-01', '2016-09-16', dtype='datetime64[D]')


Out[105]:
array(['2016-08-01', '2016-08-02', '2016-08-03', '2016-08-04',
       '2016-08-05', '2016-08-06', '2016-08-07', '2016-08-08',
       '2016-08-09', '2016-08-10', '2016-08-11', '2016-08-12',
       '2016-08-13', '2016-08-14', '2016-08-15', '2016-08-16',
       '2016-08-17', '2016-08-18', '2016-08-19', '2016-08-20',
       '2016-08-21', '2016-08-22', '2016-08-23', '2016-08-24',
       '2016-08-25', '2016-08-26', '2016-08-27', '2016-08-28',
       '2016-08-29', '2016-08-30', '2016-08-31', '2016-09-01',
       '2016-09-02', '2016-09-03', '2016-09-04', '2016-09-05',
       '2016-09-06', '2016-09-07', '2016-09-08', '2016-09-09',
       '2016-09-10', '2016-09-11', '2016-09-12', '2016-09-13',
       '2016-09-14', '2016-09-15'], dtype='datetime64[D]')

In [106]:
#############################################
# Parse the date
#############################################


from dateutil.parser import parse
import pandas as pd
import numpy as np 
import requests
import datetime



def parse_date(var):
    """Return datetime object from string."""
    
    try:
        return np.where(isinstance(parse(var),datetime.datetime),
                 parse(var),"Error")             
    except:
        return "You entered an incorrect date.  Check your date format."


def dateFormatter(datearray):
    """Function to format strings for numpy arange"""
    return parse(datearray).strftime("%Y-%m-%d")
    

def dateRanger(originalArray):
    """Function to vectorize date formatting function.
    Creates datetime.date objects for each day in the range
    and stores in a numpy array.
    
    Example
    
    Parameters
        ----------
        originalArray : {string or list}
            A single date string or a list of date strings to query.
    Returns
    -------
    dates : numpy array
        Array of datetime objects covering the request.
    """
    if isinstance(originalArray,str):
        """Check user input to retrieve date query."""
    
        return np.where(len(originalArray)==0,np.array(datetime.datetime.now()),
                 parse_date(originalArray))
    
    elif isinstance(originalArray,list):
       
        if len(originalArray)==1:
            return np.array(parse("".join(originalArray)))
        elif len(originalArray)>2:
#           
#             return np.array(map(parse,originalArray),dtype='datetime64[D]').tolist()
            return np.array(map(lambda x: parse(x),originalArray))
        else:
            
            cleaner = np.vectorize(dateFormatter)
            converted = cleaner(originalArray).tolist()
            dates = np.arange(converted[0],converted[1],dtype='datetime64[D]')
            dates = map(lambda x: datetime.datetime.combine(x,datetime.datetime.min.time()),dates.tolist())
            if len(originalArray)==2:
                adder = np.datetime64(parse(converted[1]).date())
                adder = datetime.datetime.combine(adder.tolist(),datetime.datetime.min.time())
                return np.append(dates,adder) # numpy range is not endpoint inclusive
            else:
                pass
            return np.array(dates)

# def gdeltRangeString(element,coverage = None):
#     """Takes a numpy datetime and converts to string"""
    
#     ########################
#     # Numpy datetime to object
#     ########################
#     element = element.tolist()
    
#     #########################
#     # Current day check
#     #########################
#     if element == datetime.datetime.now().date():
#         hour = datetime.datetime.now().hour
#         multiplier = datetime.datetime.now().minute / 15
#         multiple = 15 * multiplier
#         conditioner =  multiplier +1
#         converted = datetime.datetime.now().replace(minute=multiple,second=0).strftime('%Y%m%d%H%M%S')
        
#         ####################
#         # Check for full data
#         ####################
#         if coverage:
#             converted = restOfDay = np.array(
#         map(
#             lambda x: np.datetime64(parse(str(element)+" "+ x)
#                                    ).tolist().strftime(
#                 '%Y%m%d%H%M%S'
#             ),times[:hour*4+conditioner]))
            
        
        
#     else:
        
#         if isinstance(element,list)==True:
#             converted = map(lambda x: x,element)
#         else:
#             converted = (datetime.datetime.combine(element,datetime.time.min) + 
#                 datetime.timedelta(
#                                     minutes=45,hours=23
#                                     )
#                                    ).strftime('%Y%m%d%H%M%S')
#         if coverage:
            
#             converted = restOfDay = np.array(
#             map(
#                 lambda x: np.datetime64(parse('2016 10 15 '+ x)
#                                        ).tolist().strftime(
#                     '%Y%m%d%H%M%S'
#                 ),times[:]))
            

            
#     return converted


def vectorizer(function,dateArray):
    helper = np.vectorize(function)
    
    final = helper(dateArray.tolist()).tolist()
    
    if isinstance(final,list):
        
        final = list(set(final))
    elif isinstance(final,str):
        final=final
    else:
        pass
    
    return final

# Finds the urls from an array of dates


def UrlFinder(targetDate):
    return masterdf[masterdf[2].str.contains(targetDate)]


def vectorizedUrlFinder(function,urlList):
    helper=np.vectorize(function)
    return pd.concat(helper(urlList).tolist())


def downloadVectorizer(function,urlList):
    '''
    test2 = downloadVectorizer(downloadAndExtract,b)
    test2.columns=gkgHeaders.tableId.tolist()
    '''
    helper=np.vectorize(function)
    return pd.concat(helper(urlList).tolist())


date = datetime.datetime.now().strftime('%Y %m %d')
date2 = "2016 Mar 13"
date3 = ["2016 Oct 1",'2016 Oct 15']
date4 = ["2016 Jan 1",'2016 Oct 15','2016 Apr 3']
date5 = ['2016 Jan 20','2016 Apr 3', "2015 Jun 8", '2015 25 Dec']


minutes = map(str,range(0,60,15))
hours = map(str,range(0,24))
times =[]
for l in hours:
    if int(l)<10:
        l="0"+l
    for k in minutes:
        if k == "0":
            k='00'
        times.append('{0}:{1}'.format(l,k))

Logic

  • Enter a string date
  • parse the date
  • check the date (list or individual, valid or not)
  • check for a 15-minute request or a full-day request
  • generate a GDELT-valid date string
  • generate a GDELT-valid URL
  • download the URLs
  • add headers
  • concatenate into a dataframe
  • output data as a dataframe, csv, json, excel, stata, gbq, sql, etc. (an end-to-end sketch follows this list)
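
Stringing those steps together end to end; a hedged sketch that leans on dateRanger above plus the fixed gdeltRangeString and urlBuilder defined below (events table, version 2):

In [ ]:
# End-to-end sketch: date strings -> GDELT datestrings -> v2 events URLs ->
# one concatenated dataframe with headers attached.
from io import BytesIO

datestrings = gdeltRangeString(dateRanger(['2016 Oct 1', '2016 Oct 15']), version=2)
frames = []
for url in urlBuilder(datestrings, version=2, table='events'):
    r = requests.get(url)
    frames.append(pd.read_csv(BytesIO(r.content), compression='zip',
                              sep='\t', header=None))
combined = pd.concat(frames).reset_index(drop=True)
combined.columns = eventsDbHeaders.tableId.tolist()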

Fixed RangeString


In [110]:
import warnings

def gdeltRangeString(element,coverage = None, version=2.0):
    """Takes a numpy datetime and converts to string"""

    ########################
    # Numpy datetime to object
    ########################

    element = element.tolist()

    ########################
    # Current day check
    ########################

    hour = datetime.datetime.now().hour
    multiplier = datetime.datetime.now().minute / 15
    multiple = 15 * multiplier
    conditioner = multiplier + 1
    
    
    
    if isinstance(element,list)==False:
        
        
        if element.date() == datetime.datetime.now().date():
            if coverage and int(version) !=1:
                print "coverage current"
                converted = np.array(
                        map(
                            lambda x: np.datetime64(parse(str(element)+" "+ x)
                            ).tolist().strftime(
                                                '%Y%m%d%H%M%S'
                                                ),times[:hour*4+conditioner]))
            else:
                converted = datetime.datetime.now().replace(minute=multiple,second=0).strftime('%Y%m%d%H%M%S')
                
        else:
            if coverage and int(version) !=1:
                
                converted = np.array(
                map(
                    lambda x: np.datetime64(parse(str(element)+" "+ x)
                                           ).tolist().strftime(
                        '%Y%m%d%H%M%S'
                    ),times[:]))
            else:
            

                converted = element.replace(minute=multiple,second=0).strftime('%Y%m%d%H%M%S')
        
        
    #################################
    # All non-current dates section
    #################################    
    
    else:

        ####################
        # Handling list
        ####################

        # element is always a list here; stamp each day at its last
        # 15-minute file (23:45)
        converted = map(lambda x: (datetime.datetime.combine(x,datetime.time.min) +
            datetime.timedelta(
                                minutes=45,hours=23
                                )
                               ).strftime('%Y%m%d%H%M%S'),element)
        
        ####################
        # Return all 15 min intervals
        # ignore this for version 1
        #################### 
        if coverage and int(version) !=1:
            
            converted = []
            for i in element:
 
                converted.append(np.array(
                map(
                    lambda x: np.datetime64(parse(str(i)+" "+ x)
                                           ).tolist().strftime(
                        '%Y%m%d%H%M%S'
                    ),times[:])))
            converted = np.concatenate(converted,axis=0)
            if len(converted.tolist())>=(3*192):
                warnings.warn('\n\nThis query will download {0} files, and likely exhaust your memory with possibly 10s of GBs of data in this single query.  Hit Ctrl-C to kill this query if you do not want to continue.'.format(len(converted.tolist())))
    
    ########################
    # Version 1 Datestrings
    #########################
    if int(version)==1:
        if isinstance(converted,list)==True:
            print "hey"
            converted = list(map(lambda x: np.where((parse(x) >= parse('2013 04 01')),parse(x).strftime('%Y%m%d%H%M%S')[:8],
                      np.where((parse(x) <parse('2006 01 01') and (int(version)==1)),
                               parse(x).strftime('%Y%m%d%H%M%S')[:4],parse(x).strftime('%Y%m%d%H%M%S')[:6]))
                ,converted))
            converted = map(lambda x: x.tolist(),converted)
            converted = list(set(converted)) # account for duplicates
        else:
            converted = np.where((parse(converted) >= parse('2013 04 01')),parse(converted).strftime('%Y%m%d%H%M%S')[:8],
                      np.where((parse(converted) <parse('2006 01 01') and (int(version)==1)),
                               parse(converted).strftime('%Y%m%d%H%M%S')[:4],parse(converted).strftime('%Y%m%d%H%M%S')[:6])).tolist()


            
    return converted

In [113]:
gdeltRangeString(dateRanger(date5),version=1)


hey
Out[113]:
['20160120', '20160403', '20150608', '20151225']

In [108]:
# gdeltRangeString(dateRanger(['2013 01 18','2013 March 2']),version=1,coverage=True)
b = gdeltRangeString(dateRanger(date3),version=2,coverage=True)


/Users/linwood/anaconda2/envs/gdelt_dev/lib/python2.7/site-packages/ipykernel/__main__.py:97: UserWarning: 

This query will download 1440 files, and likely exhaust your memory with possibly 10s of GBs of data in this single query.  Hit Ctrl-C to kill this query if you do not want to continue.

In [29]:
pd.DataFrame(gdeltRangeString(dateRanger(date3),version=1)[-3:])


Out[29]:
0
0 20161001
1 20161002
2 20161003

Fixing the dates so we don't need to download the master list


In [30]:
gdeltRangeString(dateRanger(date5),version=2)


Out[30]:
['20160120234500', '20160403234500', '20150608234500', '20151225234500']

In [31]:
urls2 = pd.DataFrame(gdeltRangeString(dateRanger(date),version=2,coverage=True))[0].apply(
    lambda x: "http://data.gdeltproject.org/gdeltv2/"+x+".export.CSV.zip").values.tolist()


coverage current

In [32]:
urls1 = pd.DataFrame(gdeltRangeString(dateRanger(date3),version=1)[-3:])[0].apply(
    lambda x: "http://data.gdeltproject.org/events/"+x+".export.CSV.zip").values.tolist()

In [119]:
from functools import partial
from io import BytesIO
v1RangerCoverage = partial(gdeltRangeString,version=1,coverage=True)
v2RangerCoverage = partial(gdeltRangeString,version=2,coverage=True)
v1RangerNoCoverage = partial(gdeltRangeString,version=1,coverage=False)
v2RangerNoCoverage = partial(gdeltRangeString,version=2,coverage=False)

In [34]:
# frame = pd.read_csv(BytesIO(r.content),compression='zip',sep='\t',header=None)

In [35]:
# frame.head();

In [ ]:


In [36]:
import multiprocessing
from multiprocessing import Pool, cpu_count

def parallelDownloader(urlList):
    """Download GDELT files in parallel.
    pool.imap_unordered only accepts one iterable, so mp_worker
    takes a single url argument."""
    pool = Pool(processes=cpu_count())  # pool of worker processes
    results = list(pool.imap_unordered(mp_worker, urlList))
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait until all workers are done before going on
    return results

In [37]:
# df = pd.concat(results).reset_index(drop=True)

In [ ]:


In [78]:
def urlBuilder(dateString,version,table='events'):
    '''
    Takes date string from gdeltRange string and creates GDELT urls
    
    
    Parameters
    ------------
    
    table types:
                * events (default)
                * mentions
                * gkg
    '''
    if version == 2:
        base = "http://data.gdeltproject.org/gdeltv2/"
        
    if version == 1:
        base = "http://data.gdeltproject.org/events/"
    
    
        
    if table == "events":
        caboose = ".export.CSV.zip"
    elif table == "mentions":
        caboose = ".mentions.CSV.zip"
    elif table == "gkg":
        caboose = ".gkg.csv.zip"
    else:
        raise ValueError('You entered an incorrect GDELT table type.'\
                        ' Choose between \"events\",\"mentions\",and \"gkg\".')
        
   
    if isinstance(dateString,list) or isinstance(dateString,np.ndarray):
        return map(lambda x: base+x+caboose,dateString)
    elif isinstance(dateString,str):
        return base+dateString+caboose

In [116]:
from functools import partial

version2Coverage = partial(gdeltRangeString,coverage=True,version=2)
version2NoCoverage = partial(gdeltRangeString,coverage=None,version=2)
version1 = partial(gdeltRangeString,version=1)

In [120]:
urlsv2mentions=partial(urlBuilder,version=2,table='mentions')
urlsv1mentions=partial(urlBuilder,version=1,table ='mentions')
urlsv2events=partial(urlBuilder,version=2,table='events')
urlsv1events=partial(urlBuilder,version=1,table ='events')
urlsv2gkg=partial(urlBuilder,version=2,table='gkg')

In [123]:
urlsv2events(v2RangerCoverage(dateRanger(date5)));

In [43]:
from io import BytesIO
import multiprocessing
from multiprocessing import Pool, current_process,cpu_count
import pandas as pd
import requests
import os
import datetime

def mp_worker(url):
    start = datetime.datetime.now()
    proc_name = current_process().name
    print multiprocessing.current_process().name
    proc = os.getpid()
    print ('Starting {0}-{1}'.format(proc_name,proc))
    r = requests.get(url)
    print multiprocessing.Process(name=multiprocessing.current_process().name).is_alive()
    frame = pd.read_csv(BytesIO(r.content),compression='zip',sep='\t',header=None)
    end = datetime.datetime.now() - start
    print "{0} with id {1} finished processing in {2}".format(proc_name,proc,end)
    return frame

In [125]:
r = requests.get('http://data.gdeltproject.org/gdeltv2/20160120000000.export.CSV.zip')
r.url


Out[125]:
u'http://data.gdeltproject.org/gdeltv2/20160120000000.export.CSV.zip'

In [51]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? y

In [114]:
downloads = urlsv2gkg(v2RangerNoCoverage(dateRanger(date5)))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-114-68cb7f0a5ce6> in <module>()
----> 1 downloads = urlsv2gkg(v2RangerNoCoverage(dateRanger(date5)))

NameError: name 'urlsv2gkg' is not defined

In [48]:
import multiprocessing

pool = Pool(processes = cpu_count())

results = list(pool.imap_unordered(mp_worker,downloads))


pool.close()
pool.terminate()
pool.join()


answer = pd.concat(results)


PoolWorker-6
PoolWorker-5
PoolWorker-8
Starting PoolWorker-6-2741
PoolWorker-7
Starting PoolWorker-5-2740
Starting PoolWorker-7-2742
Starting PoolWorker-8-2743
False
PoolWorker-8 with id 2743 finished processing in 0:00:01.475307
False
False
PoolWorker-6 with id 2741 finished processing in 0:00:02.756037
False
PoolWorker-7 with id 2742 finished processing in 0:00:03.099380
PoolWorker-5 with id 2740 finished processing in 0:00:04.109154

In [50]:
print pool.terminate()


None

In [46]:
answers = pd.concat(results)
del results

In [47]:
answers.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8918 entries, 0 to 3623
Data columns (total 27 columns):
0     8918 non-null object
1     8918 non-null int64
2     8918 non-null int64
3     8917 non-null object
4     8918 non-null object
5     1189 non-null object
6     1189 non-null object
7     7981 non-null object
8     7981 non-null object
9     7061 non-null object
10    7052 non-null object
11    7075 non-null object
12    6993 non-null object
13    6766 non-null object
14    6540 non-null object
15    8918 non-null object
16    4303 non-null object
17    8918 non-null object
18    5898 non-null object
19    1330 non-null object
20    158 non-null object
21    1964 non-null object
22    1888 non-null object
23    8590 non-null object
24    6857 non-null object
25    0 non-null float64
26    0 non-null float64
dtypes: float64(2), int64(2), object(23)
memory usage: 1.9+ MB

In [ ]:
pool = Pool(processes = cpu_count())
%time results = list(pool.imap_unordered(requests.get,urlsv2gkg(v2RangerNoCoverage(dateRanger(date5)))))
pool.close()
pool.join()

In [205]:
r = requests.get('http://data.gdeltproject.org/gdeltv2/20160313000000.mentions.CSV.zip')
df = pd.read_csv(BytesIO(r.content),compression='zip',sep='\t',header=None)

In [ ]:
urls = []
if isinstance(b,list):
    print map(lambda x: urlBuilder(x,version=1),b)
elif isinstance(b,str):
    print urlBuilder(b,version=1)

# for l in b:
    
#     urls.append(urlBuilder(l,'events',version=1))

In [ ]:
import pandas as pd
from io import BytesIO
import requests 

dfs = []
for l in urls:
    r = requests.get(l)
    try:
        frame = pd.read_csv(BytesIO(r.content),compression='zip',sep = '\t',header=None)
        dfs.append(frame)
        print frame.shape
        del frame
    except:
        pass
    
df = pd.concat(dfs)

In [ ]:
import multiprocessing as mp
import itertools
import time


def g():
    for el in xrange(50):
        print el
        yield el


def f(x):
    time.sleep(1)
    return x * x

if __name__ == '__main__':
    pool = mp.Pool(processes=4)              # start 4 worker processes
    go = g()
    result = []
    N = 11
    while True:
        g2 = pool.map(f, itertools.islice(go, N))
        if g2:
            result.extend(g2)
            time.sleep(1)
        else:
            break
    print(result)

In [ ]:
result

In [ ]:
import pandas as pd
df = pd.read_csv(urls[0], compression='zip',sep='\t',header=None)

In [ ]:
from blaze import data
import dask.dataframe as dd
import dask.bag as db
from dask import delayed

In [ ]:
def reader(filename):
    return pd.read_csv(filename,compression='zip',sep='\t',header=None)

# build list of delayed pandas csv reads; then read in as dask dataframe

# dfs = [delayed(reader)(fn) for fn in urls]
# df = dd.from_delayed(dfs)

In [ ]:
urls[:20]

In [ ]:
r = requests.get('http://data.gdeltproject.org/gdeltv2/20160313044500.export.CSV.zip')

In [ ]:
from io import BytesIO

df = pd.read_csv(BytesIO(r.content),compression='zip',sep='\t',header = None)

In [ ]:
df.head()

In [ ]:
# for l in urls:  (abandoned loop stub)

In [ ]:
import multiprocessing as mp
pool = mp.Pool()
results = list(pool.imap_unordered(mp_worker, urls))
pool.close()
pool.join()

Parallel Download Code


In [ ]:
import multiprocessing
import time
import zipfile
import requests
from io import BytesIO



def mp_worker(url):
    start = datetime.datetime.now()
    r = requests.get(url)
    frame = pd.read_csv(BytesIO(r.content),compression='zip',sep='\t',header=None)
    end = datetime.datetime.now() - start
    return frame
    
    

def mp_handler():
    result = []
    p = multiprocessing.Pool(4)
    dfs = p.imap_unordered(mp_worker,urls)
    if dfs:
        try:
            result.extend(dfs)
            time.sleep(1)
        except:
            pass
        
    else:
        pass
    return result

if __name__ == '__main__':
#     answer = mp_handler()
    mp_handler()
    
    
    
    
# http://stackoverflow.com/questions/5318936/python-multiprocessing-pool-lazy-iteration    
# http://stackoverflow.com/questions/20577472/how-to-keep-track-of-asynchronous-results-returned-from-a-multiprocessing-pool

In [ ]:
import numpy as np
from dateutil.parser import parse
import datetime

dates=['2016 10 17','2016 10 11']
dates2 = np.arange('2016-10-11','2016-10-18',dtype='datetime64[D]')


if np.all(np.logical_not(np.array(map(parse, dates)) > datetime.datetime.now())) == False:
    print "Yes"
    
else:
    print "No"

In [ ]:
np.array(map(lambda x: parse(x),dates),dtype='datetime64[D]')

In [ ]:
np.all(np.logical_not(np.array(map(lambda x: parse(x),dates),dtype='datetime64[D]') >= np.datetime64(datetime.datetime.now().date())))

In [ ]:
(datetime.datetime.now() + datetime.timedelta(days=+1))

In [ ]:
np.datetime64(datetime.datetime.now().date()) - np.timedelta64(1,'D')

In [ ]:
df.info()

In [ ]:
from multiprocessing import Pool
from io import BytesIO
import pandas as pd
import requests
import datetime

def mp_worker(url):
    start = datetime.datetime.now()
    r = requests.get(url)
    try:
        frame = pd.read_csv(BytesIO(r.content),compression='zip',sep='\t',header=None)
    except:
        frame = None  # bad zip or empty download; skipped downstream
    end = datetime.datetime.now() - start
    return frame


def parallelDownload(function,urlList):
    p = Pool()
    results = []
    for frame in p.imap_unordered(function, urlList):
        if frame is not None:
            results.append(frame)
    p.close()  # close, then join; the pool is local to this function
    p.join()
    return pd.concat(results).reset_index(drop=True)

if __name__ == '__main__':
    b = parallelDownload(mp_worker,urls)

In [ ]:
b[0]

In [ ]:
import multiprocessing
from io import BytesIO
import pandas as pd
import requests

def worker():
    """worker function"""
    print 'Worker'
    return

if __name__ == '__main__':
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=worker)
        jobs.append(p)
        p.start()

In [ ]:
dfs = easy_parallize(mp_worker,urls)

In [ ]:
from multiprocessing.pool import Pool
#from multiprocessing.pool import ThreadPool as Pool  # to use threads

pool = Pool()  # Pool is not a context manager in Python 2
answers = []
for i in urls:
    result1 = pool.apply_async(mp_worker, (i,))  # args must be a tuple
    answers.append(result1.get())
pool.close()
pool.join()

In [ ]:
ddf = dd.read_csv('/Users/linwood/Downloads/20150218230000.export.CSV.zip',sep='\t',header=None)

In [ ]:
table = ''

if table == 'events' or table == '' or table == 'mentions':
    print "Yes"
elif table == 'gkg':
    print "gkg"
else:
    print "You have entered a wrong table"

In [ ]:
hour = datetime.datetime.now().hour
multiplier = datetime.datetime.now().minute / 15
multiple = 15 * multiplier
conditioner = multiplier +1
np.array(map(lambda x: np.datetime64(parse(str(dateRanger(date3)[-1])+" " + x)).tolist().strftime('%Y%m%d%H%M%S'),times[:hour*4+conditioner])).tolist()

In [ ]:
gdeltRangeString(dateRanger(['2016 Oct 1 10:32']))

In [ ]:
minutes =(map(str,range(00,60,15)))
hours = (map(str,range(0,24)))
times =[]
for l in hours:
    if int(l)<10:
        l="0"+l
    for k in minutes:
        if k == "0":
            k='00'
        times.append('{0}:{1}'.format(l,k))

In [ ]:
vectorizer(gdeltRangeString,test)

In [ ]:
np.datetime64(parse('2016 10 14'),'h').tolist()

Working Examples for Single Date Functionality


In [ ]:
date = '2016 9 12'

gdeltRangeString(np.datetime64(parse('2016 10 14')))

Working Examples of Date Range Functionality


In [ ]:
date=['2016 09 01','2016 09 24']
(dateRanger(date))

In [ ]:
# converts to gd
datesToPull = vectorizer(gdeltRangeString,dateRanger(['2016 Oct 13','2016 Oct 15']))

In [ ]:
datesToPull

In [ ]:
masterdf[masterdf[2].str.contains('|'.join(datesToPull))]

In [ ]:
# gets the urls from array
resultMaster = vectorizedUrlFinder(UrlFinder,datesToPull)

Testing Area for Dates; Above is good, below is experimental


In [ ]:
tblCheck('gkg')

In [ ]:
test2.reset_index(drop=True,inplace=True)

In [ ]:
datesToPull

Munging Data: Extracting Specific Datasets or All of Them

Work with the returned GDELT data. Specify whether we are pulling the mentions, events, or gkg data for the day, or all of them. A hedged merge sketch follows.
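
A minimal sketch of the events-plus-mentions case, mirroring the commented-out In [93] cell above: pull both tables for an assumed example day and merge on GLOBALEVENTID (uses downloadVectorizer from above and the downloadAndExtract function defined just below):

In [ ]:
# Sketch: merge a day's events with its mentions on GLOBALEVENTID.
dayFull = downloadVectorizer(downloadAndExtract,
    masterdf[(masterdf[2].str.contains('/20160926')) &
             (masterdf[2].str.contains('export'))][2].unique().tolist())
dayFull.columns = eventsDbHeaders.tableId.tolist()

dayMentions = downloadVectorizer(downloadAndExtract,
    masterdf[(masterdf[2].str.contains('/20160926')) &
             (masterdf[2].str.contains('mentions'))][2].unique().tolist())
dayMentions.columns = mentionsHeaders.tableId.tolist()

finalFull = dayFull.merge(dayMentions, how='outer', on='GLOBALEVENTID')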


In [ ]:
results = match_date(gdelt_timeString(dateInputCheck(date)))

In [ ]:
target = results[2][results[2].str.contains('export')].reset_index(drop=True).ix[0]

In [ ]:
target

In [ ]:
#############################################
# GDELT data download and extraction
#############################################

from StringIO import StringIO
import pandas as pd
import requests
import zipfile
import re

def downloadAndExtract(gdeltUrl):
    """Downloads and extracts GDELT zips without saving to disk"""
    
    response = requests.get(gdeltUrl, stream=True)
    zipdata = StringIO()
    zipdata.write(response.content)
    gdelt_zipfile = zipfile.ZipFile(zipdata,'r')
    name = re.search('(([\d]{4,}).*)',gdelt_zipfile.namelist()[0]).group().replace('.zip',"")
    data = gdelt_zipfile.read(name)
    gdelt_zipfile.close()
    del zipdata,gdelt_zipfile,name,response
    return pd.read_csv(StringIO(data),delimiter='\t',header=None)
    

def add_header(gdeltUrl):
    """Returns the header rows for the dataframe"""
    
    dbType = re.search(
        '(mentions|export|gkg)',
        gdeltUrl
        ).group()
    
    if dbType == "gkg":
        headers = gkgHeaders.tableId.tolist()
    
    elif dbType == "mentions":
        headers = mentionsHeaders.tableId.tolist()
        
    elif dbType == "export":
        headers = eventsDbHeaders.tableId.tolist()
        
    return headers

In [ ]:
target = 'http://data.gdeltproject.org/gdeltv2/20160924150000.export.CSV.zip'

In [ ]:
gdelt_df = downloadAndExtract(target)
gdelt_df.columns = add_header(target)
gdelt_df.info()

In [ ]:
mast

In [ ]:
combined = gdelt_df.merge(gdelt_df2,how='outer',on='GLOBALEVENTID')

In [ ]:
combined.info()

In [ ]:
combined.columns

In [ ]:
# combined.[(combined.Confidence != None) & (combined.MonthYear != None)]
combined[['Actor1Code','Actor1Name']][(combined.GoldsteinScale <= -5.2) & (combined.Actor1Code != "")].fillna('')

Early Pipeline to Write out R Dataframe

Ways to install

pip install feather-format
conda install feather-format -c conda-forge

IT WORKS!!!


In [ ]:
import feather
path = 'my_data.feather'
feather.api.write_dataframe(testdf, path)
newtestdf = feather.api.read_dataframe(path)

Leftovers; Junkyard below (stuff to work on)


In [ ]:
results = masterListdf[masterListdf[2].str.contains(gdelt_timeString(dateInputCheck(date)))==True]

In [ ]:
results[2].reset_index().ix[0][2]

In [ ]:
results[results[2].str.contains('gkg')]

In [ ]:
gdelt_timeString(dateInputCheck(date))

In [ ]:
import re
from dateutil.parser import parse
re.search('(([\d]{4,}).*)',clean[20][-1]).group()

In [ ]:
if bool(4>3):
    print "Hello"

In [ ]:
(datetime.datetime.now().replace(hour=0,minute=0,second=0,microsecond=0)) == parse("2016 09 18" )

In [ ]:
b = parse(re.search('([\d]{4,})',clean[20][-1]).group())

In [ ]:
matchDate = re.search('([\d]{4,})',clean[20][-1]).group()

In [ ]:
def time_change(current,diff):
    date = current.replace(minute=0, second=0) + timedelta(minutes=diff)
    return date.strftime("%Y%m%d%H%M%S")

In [ ]:
# pulling most current daily report

import numpy as np
import datetime
from datetime import timedelta

currentTime = datetime.datetime.now()
timeDiff = currentTime.minute / 15 

query = np.where(timeDiff == 1,time_change(currentTime,diff=15),
        np.where(timeDiff == 2, time_change(currentTime,diff=30),
                 np.where(timeDiff == 3, time_change(currentTime,diff=45),
                          time_change(currentTime,diff=0))))

baseUrl = 'http://data.gdeltproject.org/gdeltv2/' + str(query) + '.export.CSV.zip'

In [ ]:
data

In [ ]:
myzipfile.namelist()

In [ ]:
import zipfile


r = requests.get(baseUrl, stream=True)

# with open('gdelt.zip', 'wb') as f:
#     f.write(r.content)
# fh = open('gdelt.zip')
# g = zipfile.ZipFile(fh)
# g.extractall()

from StringIO import StringIO
zipdata = StringIO()
zipdata.write(r.content)
myzipfile = zipfile.ZipFile(zipdata,'r')
data = myzipfile.read(str(query) + '.export.CSV')
gdeltdf = pd.read_csv(StringIO(data),delimiter='\t',header=None)

In [ ]:
gdeltdf.columns=headers.tableId.tolist()

In [ ]:
gdeltdf.SOURCEURL[((gdeltdf.ActionGeo_CountryCode =='SY')|(gdeltdf.ActionGeo_CountryCode =='IZ')) & (gdeltdf.GoldsteinScale < -4)]

In [ ]:
text = '''
GLOBALEVENTID	INTEGER	NULLABLE	This is the ID of the event that was mentioned in the article.
EventTimeDate	INTEGER	NULLABLE	This is the 15-minute timestamp (YYYYMMDDHHMMSS) when the event being mentioned was first recorded by GDELT (the DATEADDED field of the original event record).  This field can be compared against the next one to identify events being mentioned for the first time (their first mentions) or to identify events of a particular vintage being mentioned now (such as filtering for mentions of events at least one week old).
MentionTimeDate	INTEGER	NULLABLE	This is the 15-minute timestamp (YYYYMMDDHHMMSS) of the current update.  This is identical for all entries in the update file but is included to make it easier to load the Mentions table into a database.
MentionType	INTEGER	NULLABLE	This is a numeric identifier that refers to the source collection the document came from and is used to interpret the MentionIdentifier in the next column.  In essence, it specifies how to interpret the MentionIdentifier to locate the actual document.  At present, it can hold one of the following values:o 1 = WEB (The document originates from the open web and the MentionIdentifier is a fully-qualified URL that can be used to access the document on the web).o 2 = CITATIONONLY (The document originates from a broadcast, print, or other offline source in which only a textual citation is available for the document.  In this case the MentionIdentifier contains the textual citation for the document).o 3 = CORE (The document originates from the CORE archive and the MentionIdentifier contains its DOI, suitable for accessing the original document through the CORE website).o 4 = DTIC (The document originates from the DTIC archive and the MentionIdentifier contains its DOI, suitable for accessing the original document through the DTIC website).o 5 = JSTOR (The document originates from the JSTOR archive and the MentionIdentifier contains its DOI, suitable for accessing the original document through your JSTOR subscription if your institution subscribes to it).o 6 = NONTEXTUALSOURCE (The document originates from a textual proxy (such as closed captioning) of a non-textual information source (such as a video) available via a URL and the MentionIdentifier provides the URL of the non-textual original source.  At present, this Collection Identifier is used for processing of the closed captioning streams of the Internet Archive Television News Archive in which each broadcast is available via a URL, but the URL offers access only to the video of the broadcast and does not provide any access to the textual closed captioning used to generate the metadata.  This code is used in order to draw a distinction between URL-based textual material (Collection Identifier 1 (WEB) and URL-based non-textual material like the Television News Archive).
MentionSourceName	STRING	NULLABLE	This is a human-friendly identifier of the source of the document.  For material originating from the open web with a URL this field will contain the top-level domain the page was from.  For BBC Monitoring material it will contain “BBC Monitoring” and for JSTOR material it will contain “JSTOR.”  This field is intended for human display of major sources as well as for network analysis of information flows by source, obviating the requirement to perform domain or other parsing of the MentionIdentifier field.
MentionIdentifier	STRING	NULLABLE	This is the unique external identifier for the source document.  It can be used to uniquely identify the document and access it if you have the necessary subscriptions or authorizations and/or the document is public access.  This field can contain a range of values, from URLs of open web resources to textual citations of print or broadcast material to DOI identifiers for various document repositories.  For example, if MentionType is equal to 1, this field will contain a fully-qualified URL suitable for direct access.  If MentionType is equal to 2, this field will contain a textual citation akin to what would appear in an academic journal article referencing that document (NOTE that the actual citation format will vary (usually between APA, Chicago, Harvard, or MLA) depending on a number of factors and no assumptions should be made on its precise format at this time due to the way in which this data is currently provided to GDELT – future efforts will focus on normalization of this field to a standard citation format).  If MentionType is 3, the field will contain a numeric or alpha-numeric DOI that can be typed into JSTOR’s search engine to access the document if your institution has a JSTOR subscription.
SentenceID	INTEGER	NULLABLE	The sentence within the article where the event was mentioned (starting with the first sentence as 1, the second sentence as 2, the third sentence as 3, and so on).  This can be used similarly to the CharOffset fields below, but reports the event’s location in the article in terms of sentences instead of characters, which is more amenable to certain measures of the “importance” of an event’s positioning within an article.
Actor1CharOffset	INTEGER	NULLABLE	The location within the article (in terms of English characters) where Actor1 was found.  This can be used in combination with the GKG or other analysis to identify further characteristics and attributes of the actor.  NOTE: due to processing performed on each article, this may be slightly offset from the position seen when the article is rendered in a web browser.
Actor2CharOffset	INTEGER	NULLABLE	The location within the article (in terms of English characters) where Actor2 was found.  This can be used in combination with the GKG or other analysis to identify further characteristics and attributes of the actor.  NOTE: due to processing performed on each article, this may be slightly offset from the position seen when the article is rendered in a web browser.
ActionCharOffset	INTEGER	NULLABLE	The location within the article (in terms of English characters) where the core Action description was found.  This can be used in combination with the GKG or other analysis to identify further characteristics and attributes of the actor.  NOTE: due to processing performed on each article, this may be slightly offset from the position seen when the article is rendered in a web browser.
InRawText	INTEGER	NULLABLE	This records whether the event was found in the original unaltered raw article text (a value of 1) or whether advanced natural language processing algorithms were required to synthesize and rewrite the article text to identify the event (a value of 0).  See the discussion on the Confidence field below for more details.  Mentions with a value of “1” in this field likely represent strong detail-rich references to an event.
Confidence	INTEGER	NULLABLE	Percent confidence in the extraction of this event from this article.  See the discussion in the codebook at http://data.gdeltproject.org/documentation/GDELT-Event_Codebook-V2.0.pdf
MentionDocLen	INTEGER	NULLABLE	The length in English characters of the source document (making it possible to filter for short articles focusing on a particular event versus long summary articles that casually mention an event in passing).
MentionDocTone	FLOAT	NULLABLE	The same contents as the AvgTone field in the Events table, but computed for this particular article.  NOTE: users interested in emotional measures should use the MentionIdentifier field above to merge the Mentions table with the GKG table to access the complete set of 2,300 emotions and themes from the GCAM system.
MentionDocTranslationInfo	STRING	NULLABLE	This field is internally delimited by semicolons and is used to record provenance information for machine translated documents indicating the original source language and the citation of the translation system used to translate the document for processing.  It will be blank for documents originally in English.  At this time the field will also be blank for documents translated by a human translator and provided to GDELT in English (such as BBC Monitoring materials) – in future this field may be expanded to include information on human translation pipelines, but at present it only captures information on machine translated materials.  An example of the contents of this field might be “srclc:fra; eng:Moses 2.1.1 / MosesCore Europarl fr-en / GT-FRA 1.0”.  NOTE:  Machine translation is often not as accurate as human translation and users requiring the highest possible confidence levels may wish to exclude events whose only mentions are in translated reports, while those needing the highest-possible coverage of the non-Western world will find that these events often offer the earliest glimmers of breaking events or smaller-bore events of less interest to Western media.o SRCLC. This is the Source Language Code, representing the three-letter ISO639-2 code of the language of the original source material. o ENG.  This is a textual citation string that indicates the engine(s) and model(s) used to translate the text.  The format of this field will vary across engines and over time and no expectations should be made on the ordering or formatting of this field.  In the example above, the string “Moses 2.1.1 / MosesCore Europarl fr-en / GT-FRA 1.0” indicates that the document was translated using version 2.1.1 of the Moses   SMT platform, using the “MosesCore Europarl fr-en” translation and language models, with the final translation enhanced via GDELT Translingual’s own version 1.0 French translation and language models.  A value of “GT-ARA 1.0” indicates that GDELT Translingual’s version 1.0 Arabic translation and language models were the sole resources used for translation.  Additional language systems used in the translation pipeline such as word segmentation systems are also captured in this field such that a value of “GT-ZHO 1.0 / Stanford PKU” indicates that the Stanford Chinese Word Segmenter   was used to segment the text into individual words and sentences, which were then translated by GDELT Translingual’s own version 1.0 Chinese (Traditional or Simplified) translation and language models.
Extras	STRING	NULLABLE	This field is currently blank, but is reserved for future use to encode special additional measurements for selected material.
'''

In [ ]:
from StringIO import StringIO
eventMentions = pd.read_csv(StringIO(text),delimiter='\t',header=None)

In [ ]:
eventMentions.columns=['tableId', 'dataType','Empty', 'Description']

In [ ]:
eventMentions.to_csv('../../gdelt2HeaderRows/schema_csvs/GDELT_2.0_eventMentions_Column_Labels_Header_Row_Sep2016.tsv',encoding='utf-8',sep='\t')

In [ ]:
eventMentions

In [ ]:
gkgdf.to_csv('../../gdelt2HeaderRows/schema_csvs/GDELT_2.0_gdeltKnowledgeGraph_Column_Labels_Header_Row_Sep2016.tsv',encoding='utf-8',sep='\t')

In [ ]:
gkgdf.to_csv('GDELT_2.0_gdeltKnowledgeGraph_Column_Labels_Header_Row_Sep2016.csv',sep='\t',index=False,encoding='utf-8')

In [ ]:
headers.to_csv('GDELT_2.0_Events_Column_Labels_Header_Row_Sep2016.csv', index=False,encoding='utf-8')

In [ ]:
mentionsdf = pd.read_csv(StringIO(text),delimiter='\t',header=None)
mentionsdf.columns=headers.columns.tolist()

In [ ]:


In [ ]:
if np.all(np.logical_not(np.array(map(parse, date)) > datetime.datetime.now())) == False:
    raise ValueError("One of your dates is greater than the current date. Check input date strings.")

In [ ]:
dates

Building the classes


In [ ]:
N = 30
x1 = np.linspace(-2, 2, N)
x1[::-1]

In [ ]: