Untitled



In [3]:
%matplotlib inline
import networkx as nx
import csv
import re

In [4]:
i = 0

with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        print (list(csv.reader(line, skipinitialspace=True)))
        print (line)
        i+=1
        if i>3:
            break


[['0'], ['', ''], ['1467810369'], ['', ''], ['Mon Apr 06 22:19:45 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['_TheSpecialOne_'], ['', ''], ["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"], []]
"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","_TheSpecialOne_","@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

[['0'], ['', ''], ['1467810672'], ['', ''], ['Mon Apr 06 22:19:49 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['scotthamilton'], ['', ''], ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"], []]
"0","1467810672","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","scotthamilton","is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"

[['0'], ['', ''], ['1467810917'], ['', ''], ['Mon Apr 06 22:19:53 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['mattycus'], ['', ''], ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'], []]
"0","1467810917","Mon Apr 06 22:19:53 PDT 2009","NO_QUERY","mattycus","@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds"

[['0'], ['', ''], ['1467811184'], ['', ''], ['Mon Apr 06 22:19:57 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['ElleCTF'], ['', ''], ['my whole body feels itchy and like its on fire '], []]
"0","1467811184","Mon Apr 06 22:19:57 PDT 2009","NO_QUERY","ElleCTF","my whole body feels itchy and like its on fire "


In [6]:
G=nx.Graph()
m=0
n=0

with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        lineX = list(csv.reader(line, skipinitialspace=True))
        G.add_node(lineX[8][0])
        if '@' in lineX[10][0]:
            m+=1
            for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
                if t!='' and t[0]=='@':
                    G.add_edge(lineX[8][0],t[1:])
                    n+=1
        '''if n==10:
            break'''
print(nx.number_of_nodes(G))


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-6-f3843a786528> in <module>()
      5 
      6 with open('training.1600000.processed.noemoticon.csv') as f_in:
----> 7     for line in f_in:
      8         lineX = list(csv.reader(line, skipinitialspace=True))
      9         G.add_node(lineX[8][0])

/home/ubuntu/anaconda3/lib/python3.6/codecs.py in decode(self, input, final)
    319         # decode input (taking the buffer into account)
    320         data = self.buffer + input
--> 321         (result, consumed) = self._buffer_decode(data, self.errors, final)
    322         # keep undecoded input until the next call
    323         self.buffer = data[consumed:]

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 7970-7971: invalid continuation byte

In [ ]:
line

In [ ]: