In [29]:
#!/usr/bin/python

import csv
import os
import re
from datetime import datetime

import psycopg2 as pg2

In [30]:
# Database connection settings (presumably consumed by the psycopg2
# connection -- the consumer is not visible in this part of the notebook).
# Every value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the literals below
# are only fallbacks used when the corresponding variable is unset.
cfg = { 'host' : os.environ.get('TWEETS_DB_HOST', '192.168.0.20'),
        'user' : os.environ.get('TWEETS_DB_USER', 'testuser'),
        'pw' : os.environ.get('TWEETS_DB_PW', 'testpass'),
        'db' : os.environ.get('TWEETS_DB_NAME', 'dbs')}

def extract_hashtags(s):
    """Return all hashtags found in ``s`` as a list, without the leading '#'.

    A hashtag is a '#' immediately followed by one or more word characters;
    the tags are returned in the order they appear in the string.

    Approach taken from:
    https://stackoverflow.com/questions/2527892/parsing-a-tweet-to-extract-hashtags-into-an-array-in-python
    """
    hashtag_pattern = re.compile(r"#(\w+)")
    return hashtag_pattern.findall(s)

def string_to_bool(s):
    """Map the text 'True' to ``True``; any other string yields ``False``.

    The comparison is exact and case-sensitive.
    """
    if s == 'True':
        return True
    return False

def make_timestamp(s):
    """Parse a 'YYYY-MM-DDTHH:MM:SS' string into a ``datetime`` object.

    Raises ``ValueError`` (from ``strptime``) when ``s`` does not match
    that layout.

    PostgreSQL date/time types, for reference:
    https://www.postgresql.org/docs/8.0/static/datatype-datetime.html
    """
    timestamp_layout = '%Y-%m-%dT%H:%M:%S'
    return datetime.strptime(s, timestamp_layout)

def rm_non_ascii_chars(s):
    """Return ``s`` with every non-ASCII character removed.

    Args:
        s: the text to clean (a ``str``).

    Returns:
        A ``str`` containing only the ASCII characters of ``s``, in their
        original order.

    Sources:
        https://stackoverflow.com/questions/36598136/remove-all-hex-characters-from-string-in-python
        http://farmdev.com/talks/unicode/
    """
    # encode() drops everything outside ASCII but yields bytes on Python 3;
    # decode again so callers get text back rather than a b'...' object.
    return s.encode('ascii', errors='ignore').decode('ascii')

In [36]:
csv_file_name = 'american-election-tweets2.csv'
# NOTE(review): the file is not valid UTF-8 -- decoding it as UTF-8 fails on
# byte 0x85. That byte is the ellipsis character in Windows-1252, so that
# encoding is assumed here; confirm against the tool that exported the CSV.
csv_encoding = 'cp1252'

data = []
# Pass the encoding explicitly instead of relying on the platform default.
with open(csv_file_name, 'r', newline='', encoding=csv_encoding) as f:
    datareader = csv.reader(f, delimiter=';')
    # Column order in the file:
    # ['handle', 'text', 'is_retweet', 'original_author', 'time',
    #  'in_reply_to_screen_name', 'is_quote_status', 'retweet_count',
    #  'favorite_count', 'source_url', 'truncated']
    next(datareader, None)  # skip the header; tolerate a completely empty file
    for row in datareader:
        if not row:
            # csv.reader yields an empty list for blank lines
            continue
        # Convert each raw column to its proper type and collect the tweet
        # as a dict.
        d = {
            'handle': row[0],
            'text': rm_non_ascii_chars(row[1]),
            # hashtags are taken from the raw text, before non-ASCII removal
            'hashtags': extract_hashtags(row[1]),
            'is_retweet': string_to_bool(row[2]),
            'original_author': row[3],
            'time': make_timestamp(row[4]),
            'in_reply_to_screen_name': row[5],
            'is_quote_status': string_to_bool(row[6]),
            'retweet_count': int(row[7]),
            'favorite_count': int(row[8]),
            'source_url': row[9],
            'truncated': string_to_bool(row[10]),
        }
        data.append(d)
#print(data)

In [15]:
# Gather the distinct hashtags that occur across all loaded tweets.
hashtags = set()
for tweet in data:
    hashtags.update(tweet['hashtags'])

In [41]:


In [42]:


In [45]:



---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-45-7590dea24ee5> in <module>()
----> 1 data = chardet.detect(open(csv_file_name, 'r').read())

/usr/lib/python3.5/codecs.py in decode(self, input, final)
    319         # decode input (taking the buffer into account)
    320         data = self.buffer + input
--> 321         (result, consumed) = self._buffer_decode(data, self.errors, final)
    322         # keep undecoded input until the next call
    323         self.buffer = data[consumed:]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x85 in position 1392: invalid start byte

In [ ]: