In [29]:
#!/usr/bin/python
import csv
import os
import re
from datetime import datetime

import psycopg2 as pg2
In [30]:
# Database connection settings (presumably consumed by a psycopg2 connect call
# elsewhere in the notebook -- TODO confirm).
# Every value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook. The literals are
# only fallbacks that keep the previous behaviour when nothing is set.
cfg = {
    'host': os.environ.get('TWEETS_DB_HOST', '192.168.0.20'),
    'user': os.environ.get('TWEETS_DB_USER', 'testuser'),
    'pw': os.environ.get('TWEETS_DB_PASSWORD', 'testpass'),
    'db': os.environ.get('TWEETS_DB_NAME', 'dbs'),
}
def extract_hashtags(s):
    """Return every hashtag found in ``s`` as a list of strings.

    A hashtag is a ``#`` followed by one or more word characters; the
    leading ``#`` is not included in the returned values. Tags are returned
    in the order they appear, and an empty list is returned when none match.

    Source: https://stackoverflow.com/questions/2527892/parsing-a-tweet-to-extract-hashtags-into-an-array-in-python
    """
    hashtag_pattern = re.compile(r"#(\w+)")
    return hashtag_pattern.findall(s)
def string_to_bool(s):
    """Map the text ``'True'`` to ``True``; anything else compares as false.

    The comparison is exact (case-sensitive, no whitespace stripping), so
    ``'False'``, ``'true'`` and ``''`` all yield ``False``.
    """
    is_true = (s == 'True')
    return is_true
def make_timestamp(s):
    """Parse a ``YYYY-MM-DDTHH:MM:SS`` string into a ``datetime`` object.

    ``datetime.strptime`` raises ``ValueError`` when ``s`` does not match
    that layout.

    Background on the target column type:
    https://www.postgresql.org/docs/8.0/static/datatype-datetime.html
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S'
    parsed = datetime.strptime(s, timestamp_format)
    return parsed
def rm_non_ascii_chars(s):
    """Return ``s`` with every non-ASCII character removed, as a ``str``.

    Characters that cannot be represented in ASCII are dropped silently
    (``errors='ignore'``); nothing is substituted in their place.

    Source: https://stackoverflow.com/questions/36598136/remove-all-hex-characters-from-string-in-python
    Interesting background: http://farmdev.com/talks/unicode/
    """
    # str.encode() yields bytes in Python 3, so decode again: callers expect
    # cleaned text, not a bytes object.
    return s.encode('ascii', errors='ignore').decode('ascii')
In [36]:
# Read the tweet CSV into `data`: one dict per row, with typed fields.
csv_file_name = 'american-election-tweets2.csv'
data = []
with open(csv_file_name, 'r', newline='') as f:
    # The file is semicolon-delimited. Column order, as listed in its header:
    #   0 handle, 1 text, 2 is_retweet, 3 original_author, 4 time,
    #   5 in_reply_to_screen_name, 6 is_quote_status, 7 retweet_count,
    #   8 favorite_count, 9 source_url, 10 truncated
    datareader = csv.reader(f, delimiter=';')
    next(datareader)  # discard the header row
    for row in datareader:
        # Convert each raw column to its target type. Hashtags are extracted
        # from the original text (row[1]), not from the ASCII-stripped copy.
        d = {
            'handle': row[0],
            'text': rm_non_ascii_chars(row[1]),
            'hashtags': extract_hashtags(row[1]),
            'is_retweet': string_to_bool(row[2]),
            'original_author': row[3],
            'time': make_timestamp(row[4]),
            'in_reply_to_screen_name': row[5],
            'is_quote_status': string_to_bool(row[6]),
            'retweet_count': int(row[7]),
            'favorite_count': int(row[8]),
            'source_url': row[9],
            'truncated': string_to_bool(row[10]),
        }
        data.append(d)
In [15]:
# Collect the distinct hashtags used across all loaded tweets.
hashtags = set()
for tweet in data:
    # Merge this tweet's tags in one call; duplicates collapse automatically.
    hashtags.update(tweet['hashtags'])
In [41]:
In [42]:
In [45]:
In [ ]: