In [1]:
import pandas as pd
import arrow # way better than datetime
import numpy as np
import random
import re
%run helper_functions.py
Above, I import the arrow library to use in place of datetime. In my opinion, Arrow overcomes many of the shortcomings and much of the syntactic complexity of the datetime library!
Here is the documentation: https://arrow.readthedocs.io/en/latest/
In [2]:
# Load the raw file as a single-column frame (column label 0), one row per
# non-blank line; the "||"-delimited fields are split apart in a later cell.
# NOTE(review): this replaces pd.read_csv(..., sep="| |"). That separator is a
# regex that can match the empty string, so whether a line stays in one piece
# depends on how re.split treats empty matches (changed in Python 3.7).
# Reading the lines directly makes the "one line -> one row" intent explicit.
with open("tweets_formatted.txt", encoding="utf-8") as tweets_file:
    raw_lines = [line.strip() for line in tweets_file if line.strip()]
df = pd.DataFrame(raw_lines)
In [3]:
# Sanity check: (number of rows, number of columns) of the raw frame.
df.shape
Out[3]:
In [4]:
# Turn every raw "handle||tweet||timestamp" line into a dict so the whole list
# can be handed to pd.DataFrame in a single call.
list_of_dicts = []
for raw_line in df.iloc[:, 0]:
    fields = raw_line.split("||")
    record = {'handle': fields[0], 'tweet': fields[1]}
    try:
        # Parse the timestamp once and derive both columns from it.
        stamp = arrow.get(fields[2])
        record['date'] = stamp.date()
        record['time'] = stamp.time()
    except Exception:
        # Sometimes the date/time is missing or unparseable - we will have to
        # infer it. Catching Exception (not a bare except) keeps this
        # best-effort while still letting KeyboardInterrupt/SystemExit through.
        record['date'] = np.nan
        record['time'] = np.nan
    list_of_dicts.append(record)
In [5]:
# Peek at the keys of the first record to confirm the fields that were built.
list_of_dicts[0].keys()
Out[5]:
In [6]:
# Build the frame in one call: each dict becomes a row, each key a column.
new_df = pd.DataFrame(list_of_dicts) #magic!
In [7]:
# Preview the first rows; at this point they are still in file order.
new_df.head() #unsorted!
Out[7]:
In [8]:
# Newest tweets first: order by date, break ties on time (both descending),
# then renumber the rows from 0 and discard the old index.
new_df = (
    new_df
    .sort_values(by=['date', 'time'], ascending=False)
    .reset_index(drop=True)
)
# Persist the sorted frame under the name "new_df".
# NOTE(review): pickle_object is presumably provided by helper_functions.py - confirm.
pickle_object(new_df, "new_df")
In [9]:
# Preview the first rows again to confirm the new ordering (descending).
new_df.head() #sorted first on date and then on time
Out[9]:
In [15]:
# Collect the row labels of every tweet containing the phrase below, as a
# sample of the duplicated tweets in the data.
# The tweet text is selected by column name: the positional lookup iloc[i, 3]
# only lands on 'tweet' when the columns happen to be ordered alphabetically,
# and points at 'time' when they keep the dict insertion order.
is_poker_tweet = new_df['tweet'].str.contains(
    "Multiplayer #Poker", regex=False, na=False
)
sample_duplicate_indicies = new_df.index[is_poker_tweet].tolist()
In [25]:
# Display the rows collected in sample_duplicate_indicies, with every column.
new_df.iloc[sample_duplicate_indicies, :]
Out[25]:
Let's remove these duplicates in a separate notebook!
In [ ]: