In [37]:
import pandas as pd
import numpy as np
In [38]:
# read the scraped tweets; the CSV has no header row
data = pd.read_csv('output/twitterDB_all.csv', header=None)
data.columns = ['tweet', 'city']
# drop rows with missing values; copy so later column assignments
# don't trigger a SettingWithCopyWarning on a view
data_clean = data.dropna().copy()
# create a new column to hold the sentiment score
data_clean.loc[:, 'senti_score'] = np.nan
# keep the tweet text before any trailing URL
import re
regex = r'\shttps?://'
data_clean.loc[:, 'tweet_content'] = data_clean.tweet \
    .apply(lambda x: re.split(regex, x)[0])
# drop the leading "RT @user: " retweet prefix, if present
regex2 = r'\s@.+:\s'
data_clean.loc[:, 'tweet_content'] = data_clean.tweet_content \
    .apply(lambda x: re.split(regex2, x)[-1])
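A quick sanity check of the two regexes on a made-up sample tweet (the string below is illustrative, not from the dataset):
In [ ]:
sample = 'RT @someuser: lovely morning in the park https://t.co/abc123'
step1 = re.split(regex, sample)[0]    # strips the trailing URL
step2 = re.split(regex2, step1)[-1]   # strips the 'RT @someuser: ' prefix
print(step2)                          # 'lovely morning in the park'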
In [39]:
# sentiment analysis: score each tweet's polarity with TextBlob
from textblob import TextBlob
def sentiAnalyze(x):
    # polarity is a float in [-1.0, 1.0]
    return TextBlob(x).sentiment.polarity
data_clean.loc[:, 'senti_score'] = data_clean.tweet_content \
    .apply(sentiAnalyze)
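For reference, two made-up sentences showing the two ends of the polarity scale:
In [ ]:
print(sentiAnalyze('I love this beautiful city!'))  # positive, well above 0
print(sentiAnalyze('This traffic is terrible.'))    # negative, below 0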
In [40]:
# dataframe with sentiment scores and city names
data_city = data_clean[['city', 'senti_score', 'tweet_content']]
data_city.reset_index(drop=True, inplace=True)
In [43]:
# map each city name to its country with the Google Maps Geocoding API
import os
import requests
google_api_key = os.getenv('GOOGLE_MAPS_API_KEY')
def getCountry(city):
    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    params = {'address': city,
              'key': google_api_key}
    data_json = requests.get(url, params=params).json()
    # the country is the last component of the first formatted address
    for entry in data_json.get('results', []):
        return entry['formatted_address'].split(', ')[-1]
    return None  # no geocoding result for this city
data_city.loc[:, 'country'] = data_city.loc[:, 'city'].apply(getCountry)
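The apply above issues one request per row even though city names repeat. A minimal memoizing wrapper (getCountryCached is a name introduced here for illustration, not part of the original notebook) would cut that to one request per unique city:
In [ ]:
country_cache = {}
def getCountryCached(city):
    # cache results so repeated city names cost only one API call
    if city not in country_cache:
        country_cache[city] = getCountry(city)
    return country_cache[city]
# usage: data_city.loc[:, 'country'] = data_city['city'].apply(getCountryCached)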
In [46]:
data_country = data_city[['tweet_content', 'senti_score', 'country']]
In [48]:
twitter = data_country[['country', 'tweet_content', 'senti_score']]
In [ ]:
# word count prep: concatenate all tweet text per country
from string import punctuation
# group tweets by country, then join each group's tweets into one string
dic = {country: group for country, group in twitter.groupby('country')}
dic2 = {}
for country in dic:
    dic2[country] = ' '.join(dic[country]['tweet_content'])
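The punctuation import suggests the next step is an actual word count; a minimal sketch under that assumption, using collections.Counter (this counting step is not in the original notebook):
In [ ]:
from collections import Counter
word_counts = {}
for country, text in dic2.items():
    # strip punctuation and lowercase before counting
    cleaned = text.translate(str.maketrans('', '', punctuation)).lower()
    word_counts[country] = Counter(cleaned.split())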
In [51]:
unit = twitter.head(5)
unit
Out[51]:
(table: first five rows of country, tweet_content, senti_score)