In [37]:
import pandas as pd
import numpy as np

In [38]:
# Load the scraped tweets. The CSV has no header row, so the two columns
# are named explicitly after reading.
import re

data = pd.read_csv('output/twitterDB_all.csv', header=None)
data.columns = ['tweet', 'city']

# Drop incomplete rows. The explicit .copy() makes data_clean an independent
# frame, so the column assignments below write to it directly instead of
# raising SettingWithCopyWarning on a selection of `data`.
data_clean = data.dropna().copy()

# Placeholder column for the sentiment score.
data_clean.loc[:, 'senti_score'] = np.nan

# Keep only the text that precedes the first link. The earlier pattern
# required a literal backslash after "https:", so it never matched a real
# URL; this one matches both "http://" and "https://".
regex = r'\shttps?://'
data_clean.loc[:, 'tweet_content'] = data_clean['tweet'].apply(
    lambda x: re.split(regex, x)[0])

# Remove " @<something>: " prefixes by keeping whatever follows the final
# match (presumably retweet/reply attributions -- confirm against the data).
# Raw string: same pattern as before, without invalid-escape warnings.
regex2 = r'\s@.+\:\s'
data_clean.loc[:, 'tweet_content'] = data_clean['tweet_content'].apply(
    lambda x: re.split(regex2, x)[-1])

In [39]:
# sentiment analysis
from textblob import TextBlob
def sentiAnalyze(x):
    """Return the first field of TextBlob's sentiment result for text ``x``.

    NOTE(review): with TextBlob's default analyzer the first field is
    presumably the polarity score -- confirm against the TextBlob docs.
    """
    blob = TextBlob(x)
    score = blob.sentiment[0]
    return score
# Score every cleaned tweet; the scorer is passed directly to apply.
tweet_scores = data_clean['tweet_content'].apply(sentiAnalyze)
data_clean.loc[:, 'senti_score'] = tweet_scores

In [40]:
# Dataframe with sentiment score and city names.
# The selection and the re-indexing are chained into one expression instead
# of calling reset_index(inplace=True) on a column selection of data_clean:
# the result is a new frame with a fresh 0..n-1 index, so later column
# assignments on data_city do not act on a derived selection.
data_city = (
    data_clean[['city', 'senti_score', 'tweet_content']]
    .reset_index(drop=True)
)

In [43]:
# change city name to country name
import os
import requests


# API key is read from the environment so it is never stored in the notebook.
google_api_key = os.getenv('GOOGLE_MAPS_API_KEY')


def getCountry(city, timeout=10):
    """Resolve a place name to a country via the Google Geocoding API.

    Parameters
    ----------
    city : str
        Place name sent as the ``address`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10). Without a
        timeout a single stalled request would block the caller forever.

    Returns
    -------
    str or None
        The last ``', '``-separated component of the first result's
        ``formatted_address``, or ``None`` when the response carries no
        usable result (for example an error payload without ``results``).

    Notes
    -----
    Network and JSON-decoding errors raised by ``requests`` are not caught
    and propagate to the caller.
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json?"
    params = {'address': city,
              'key': google_api_key}
    data_json = requests.get(url, params=params, timeout=timeout).json()
    if not isinstance(data_json, dict):
        return None
    # Error payloads may omit 'results'; treat that the same as "no match"
    # instead of raising KeyError.
    for entry in data_json.get('results') or []:
        address = entry.get('formatted_address')
        if address:
            return address.split(', ')[-1]
    return None


# Geocode each distinct city once and map the result back onto the rows.
# Calling getCountry for every row repeats an identical network request for
# each duplicate city.
country_by_city = {city: getCountry(city)
                   for city in data_city['city'].unique()}
data_city.loc[:, 'country'] = data_city['city'].map(country_by_city)

In [46]:
# Keep the tweet text, its sentiment score and the resolved country.
data_country = data_city.loc[:, ['tweet_content', 'senti_score', 'country']]

In [48]:
# Same three columns, reordered so the country comes first.
twitter = data_country.loc[:, ['country', 'tweet_content', 'senti_score']]

In [ ]:
# wordcount
from string import punctuation


# Group the tweets by country and build one block of text per country.
#
# NOTE(review): the earlier code read a `twitters` frame and a `country_list`
# column, neither of which is defined in the visible notebook; this uses the
# `twitter` frame and its `country` column -- confirm that is the intent.
# It also no longer relies on the Python-2-only builtin `reduce`,
# `len(...)/2` as a range argument, or `dict.iterkeys`.

# country name -> sub-frame holding that country's rows
dic = {str(country): group for country, group in twitter.groupby('country')}

# country name -> all tweet texts of that country, joined with a space so the
# last word of one tweet is not fused with the first word of the next
dic2 = {country: ' '.join(group['tweet_content'])
        for country, group in dic.items()}

In [51]:
# Preview the first five rows of the per-country tweet table.
unit = twitter.iloc[:5]
unit


Out[51]:
country tweet_content senti_score
0 Canada Disturbing footage shows horrific reality for ... -0.3
1 Canada A chess set from cheese cartons. A game of sur... -0.4
2 Canada May Allah curse Bashar al Asad & his allie... 0.0
3 Spain Disturbing footage shows horrific reality for ... -0.3
4 Denmark Absurd at h\u00f8re #dkpol \u00e6vle om at IS ... -0.5

In [ ]:


In [ ]:


In [ ]:


In [ ]: