In [37]:
import pandas as pd
import numpy as np
In [38]:
# read the scraped tweets; the CSV has no header row
data = pd.read_csv('output/twitterDB_all.csv', header=None)
data.columns = ['tweet', 'city']
# drop rows with missing values; copy so later column assignments
# don't trigger a SettingWithCopyWarning on a view
data_clean = data.dropna().copy()
# create a new column to hold the sentiment score
data_clean.loc[:, 'senti_score'] = np.nan
# keep the tweet text before any trailing URL
import re
regex = r'\shttps?://'
data_clean.loc[:, 'tweet_content'] = data_clean.tweet \
    .apply(lambda x: re.split(regex, x)[0])
# drop the leading "RT @user: " retweet prefix, if present
regex2 = r'\s@.+:\s'
data_clean.loc[:, 'tweet_content'] = data_clean.tweet_content \
    .apply(lambda x: re.split(regex2, x)[-1])
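A quick sanity check of the two regexes on a made-up sample tweet (the string below is illustrative, not from the dataset):
In [ ]:
sample = 'RT @someuser: lovely morning in the park https://t.co/abc123'
step1 = re.split(regex, sample)[0]    # strips the trailing URL
step2 = re.split(regex2, step1)[-1]   # strips the 'RT @someuser: ' prefix
print(step2)                          # 'lovely morning in the park'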
In [39]:
# sentiment analysis: score each tweet's polarity with TextBlob
from textblob import TextBlob
def sentiAnalyze(x):
    # polarity is a float in [-1.0, 1.0]
    return TextBlob(x).sentiment.polarity
data_clean.loc[:, 'senti_score'] = data_clean.tweet_content \
    .apply(sentiAnalyze)
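For reference, two made-up sentences showing the two ends of the polarity scale:
In [ ]:
print(sentiAnalyze('I love this beautiful city!'))  # positive, well above 0
print(sentiAnalyze('This traffic is terrible.'))    # negative, below 0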
In [40]:
# dataframe with sentiment scores and city names
data_city = data_clean[['city', 'senti_score', 'tweet_content']]
data_city.reset_index(drop=True, inplace=True)
In [43]:
# map each city name to its country with the Google Maps Geocoding API
import os
import requests
google_api_key = os.getenv('GOOGLE_MAPS_API_KEY')
def getCountry(city):
    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    params = {'address': city,
              'key': google_api_key}
    data_json = requests.get(url, params=params).json()
    # the country is the last component of the first formatted address
    for entry in data_json.get('results', []):
        return entry['formatted_address'].split(', ')[-1]
    return None  # no geocoding result for this city
data_city.loc[:, 'country'] = data_city.loc[:, 'city'].apply(getCountry)
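The apply above issues one request per row even though city names repeat. A minimal memoizing wrapper (getCountryCached is a name introduced here for illustration, not part of the original notebook) would cut that to one request per unique city:
In [ ]:
country_cache = {}
def getCountryCached(city):
    # cache results so repeated city names cost only one API call
    if city not in country_cache:
        country_cache[city] = getCountry(city)
    return country_cache[city]
# usage: data_city.loc[:, 'country'] = data_city['city'].apply(getCountryCached)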
In [46]:
data_country = data_city[['tweet_content', 'senti_score', 'country']]
In [48]:
twitter = data_country[['country', 'tweet_content', 'senti_score']]
In [ ]:
# word count prep: concatenate all tweet text per country
from string import punctuation
# group tweets by country, then join each group's tweets into one string
dic = {country: group for country, group in twitter.groupby('country')}
dic2 = {}
for country in dic:
    dic2[country] = ' '.join(dic[country]['tweet_content'])
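The punctuation import suggests the next step is an actual word count; a minimal sketch under that assumption, using collections.Counter (this counting step is not in the original notebook):
In [ ]:
from collections import Counter
word_counts = {}
for country, text in dic2.items():
    # strip punctuation and lowercase before counting
    cleaned = text.translate(str.maketrans('', '', punctuation)).lower()
    word_counts[country] = Counter(cleaned.split())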
In [51]:
unit = twitter.head(5)
unit
Out[51]:
(table: first five rows of country, tweet_content, senti_score)