In [2]:
from __future__ import print_function, division  # keep this first so it also works outside IPython
import csv
import json
import operator
import re
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
# To install the nltk corpus for stop words, un-comment and run the next line.
# nltk.download()
from nltk.corpus import stopwords
data = []
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
## Load data: each line of the file is one JSON record; sort the lines
## by their first character before extracting the created_at timestamp.
with open('geocoordinatedata.csv', 'rt', encoding='utf8') as csvfile:
    sort = sorted(csvfile, key=operator.itemgetter(0))
for eachline in sort:
    # Pull the created_at timestamp out of the raw JSON line.
    t = eachline.split('"created_at":"')[1].split('","contributors_enabled":')[0]
    data.append(t)
    # print(t[0:3], '\n')
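# Sanity check (sketch): each line appears to be a full JSON object (the cell
# below parses it with json.loads), so the same timestamp should come back
# without string splitting; the "created_at" field name is taken from the data.
assert json.loads(sort[0])["created_at"] == data[0]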
## Distribution of tweets across days (Mon-Sun)
dataByDay = {day: [] for day in days}
for d in data:
    day = d[0:3]  # created_at starts with the weekday abbreviation
    if day not in dataByDay:
        dataByDay[day] = []
    dataByDay[day].append(d)
tweetsPerDay = [len(dataByDay[day]) for day in days]  # counts in Mon-Sun order
# print(tweetsPerDay)
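# Cross-check (sketch): collections.Counter reproduces the same per-day totals
# straight from the first three characters of each timestamp.
from collections import Counter
assert [Counter(d[:3] for d in data)[day] for day in days] == tweetsPerDay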
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.bar(np.arange(7), tweetsPerDay)
ax.set_title("Distribution of tweets by day of week")
ax.set_xlabel('Day')
ax.set_xticks(range(7))
ax.set_xticklabels(days)
ax.set_ylabel('Number of tweets')
# print(data[0][11:13], data[0][14:16])    # hour and minute fields
# dummy = data[0][11:13] + data[0][14:15]  # hour + tens-of-minutes digit
# print(int(dummy))
Out[2]:
In [3]:
## Tweet distribution per 10-minute time period
dataPer10Min = {}
for day in days:
    dataPer10Min[day] = {}
    for i in range(24):      # hour
        for j in range(6):   # tens-of-minutes digit
            # Key encodes hour*10 + tens of minutes, e.g. 13:00-13:09 -> "130".
            dataPer10Min[day]["%03d" % (i * 10 + j)] = []
for key, value in dataByDay.items():
    for v in value:
        # created_at looks like "Wed Aug 27 13:08:45 +0000 2008":
        # [11:13] is the hour, [14:15] the tens digit of the minute.
        time = "%03d" % int(v[11:13] + v[14:15])
        day = v[0:3]
        if time not in dataPer10Min[day]:
            dataPer10Min[day][time] = []
        dataPer10Min[day][time].append(v)
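# Cross-check (sketch): parsing with datetime gives the same bucket, assuming
# Twitter's standard "%a %b %d %H:%M:%S %z %Y" created_at format; uses `data`
# from the cell above.
from datetime import datetime
ts = datetime.strptime(data[0], "%a %b %d %H:%M:%S %z %Y")
assert "%03d" % (ts.hour * 10 + ts.minute // 10) == "%03d" % int(data[0][11:13] + data[0][14:15])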
fig = plt.figure(figsize=(30, 50))
timePeriods = []
for i in range(24):
    for j in range(6):
        timePeriods.append("%02d:%02d" % (i, j * 10))
for i in range(7):
    ax = fig.add_subplot(7, 1, i + 1)
    # Bucket keys sort lexicographically in time order ("000" ... "235").
    dataPer10min = [len(dataPer10Min[days[i]][k]) for k in sorted(dataPer10Min[days[i]])]
    ax.bar(np.arange(24 * 6), dataPer10min)
    ax.set_title("Distribution of tweets on " + days[i], fontsize=30)
    ax.set_xlabel('Time Period', fontsize=20)
    ax.set_xticks(range(24 * 6))
    ax.set_xticklabels(timePeriods, rotation="vertical", fontsize=12)
    ax.set_ylabel('Number of tweets', fontsize=20)
fig.tight_layout()
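# Quick summary (sketch): the busiest 10-minute bucket per day, computed from
# the structures built above; the bucket key is decoded back into a clock time.
for day in days:
    bucket, tweets = max(dataPer10Min[day].items(), key=lambda kv: len(kv[1]))
    print(day, timePeriods[int(bucket[:2]) * 6 + int(bucket[2])], len(tweets))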
In [ ]:
# For a better understanding, uncomment the print() calls throughout the code and see the results.
from __future__ import print_function, division  # keep this first so it also works outside IPython
import csv
import json
import operator
import re
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
# To install the nltk corpus for stop words, un-comment and run the next line.
# nltk.download()
from nltk.corpus import stopwords
N = 10
final_set, a, b = [], [], []
with open('geocoordinatedata.csv', 'rt', encoding='utf8') as csvfile:
    sort = sorted(csvfile, key=operator.itemgetter(0))
# Read each line of the file, extract the coordinates, and build a "final_set"
# list that we will use later for processing words.
for eachline in sort:
    coordinates = eachline.split('":{"type":"Point","coordinates":')[1].split('},"source')[0]
    # Lower-case the tweet text and split it into purely alphabetic tokens.
    filtered_tweets = list(filter(bool, re.split('[^a-z]', json.loads(eachline)["text"].lower())))
    final_set.append(filtered_tweets)
    x = coordinates.split('[')[1].split(',')[0]
    y = coordinates.split(',')[1].split(']')[0]
    a.append(float(x))
    b.append(float(y))
a_array, b_array = np.asarray(a), np.asarray(b)
final = np.stack((a_array, b_array), axis=1)
#print(final)
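# Cross-check (sketch): json.loads should recover the same [longitude, latitude]
# pair, assuming each record stores a GeoJSON Point under the "coordinates" key
# (the field name is an assumption inferred from the split pattern above).
assert np.allclose(json.loads(sort[0])["coordinates"]["coordinates"], final[0])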
# In this part we use "final_set" to process our tweets' data.
final_set_string = ''.join(str(e) for e in final_set)
wordList = re.sub(r"[^\w]", " ", final_set_string).split()
# print(wordList)
# We remove stop words, short tokens, and 'https', which appears in almost
# every tweet because of embedded links.
stopWords = set(stopwords.words('english'))  # build the set once, not per word
filteredText = [word for word in wordList
                if word not in stopWords and len(word) > 3 and word != 'https']
# print(filteredText)
# Creating a dictionary "wordDict" mapping each word to the number of times it
# appears in the filtered tweets.
results_union = set().union(*final_set)
wordDict = dict.fromkeys(results_union, 0)
for word in filteredText:
    wordDict[word] += 1
# print(wordDict)
# We sort the dictionary and keep the top N=10 words by count.
topwords = dict(sorted(wordDict.items(), key=operator.itemgetter(1), reverse=True)[:N])
print(topwords)
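# Equivalent up to tie-breaking (sketch): collections.Counter builds the same
# top-N counts in one step; words with equal counts may come out in a
# different order.
# from collections import Counter
# topwords = dict(Counter(filteredText).most_common(N))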
#Creating a DataFrame to show the top N=10 words.
df = pd.DataFrame({'Words':[*topwords]})
#print(df)
# Making the plot for tweet coordinates.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(final[:, 0], final[:, 1], label='Exact Tweet Coordinates')
ax.set_xlabel("Longitude: x")
ax.set_ylabel("Latitude: y")
ax.set_title("Tweet Location")
ax.legend()
ax.grid(True)
# Making the plot for the top N words of our tweet corpus.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1)
ax.bar(range(len(topwords)), list(topwords.values()), align="center")
ax.set_xticks(range(len(topwords)))
ax.set_xticklabels(list(topwords.keys()))
ax.set_xlabel("Popular tweet words")
ax.set_ylabel("Count")
ax.set_title("Top 10 Words")