%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from networkx.algorithms.connectivity import minimum_st_edge_cut
from networkx.algorithms.flow import shortest_augmenting_path
from sklearn.cluster import KMeans

def L1 (x,y):
    dist = 0
    if len(x)==len(y):
        for i in range(len(x)):
            dist += math.fabs(x[i]-y[i])
        print('vectors must be equal length for L1')
        return dist

i = 0

with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        print (list(csv.reader(line, skipinitialspace=True)))
        print (line)
        if i>3:

[['0'], ['', ''], ['1467810369'], ['', ''], ['Mon Apr 06 22:19:45 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['_TheSpecialOne_'], ['', ''], ["@switchfoot - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"], []]
"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","_TheSpecialOne_","@switchfoot - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

[['0'], ['', ''], ['1467810672'], ['', ''], ['Mon Apr 06 22:19:49 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['scotthamilton'], ['', ''], ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"], []]
"0","1467810672","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","scotthamilton","is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"

[['0'], ['', ''], ['1467810917'], ['', ''], ['Mon Apr 06 22:19:53 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['mattycus'], ['', ''], ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'], []]
"0","1467810917","Mon Apr 06 22:19:53 PDT 2009","NO_QUERY","mattycus","@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds"

[['0'], ['', ''], ['1467811184'], ['', ''], ['Mon Apr 06 22:19:57 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['ElleCTF'], ['', ''], ['my whole body feels itchy and like its on fire '], []]
"0","1467811184","Mon Apr 06 22:19:57 PDT 2009","NO_QUERY","ElleCTF","my whole body feels itchy and like its on fire "

"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        lineX = list(csv.reader(line, skipinitialspace=True))
        if '@' in lineX[10][0]:
            for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
                if t!='' and t[0]=='@' and t!='@':
        if n%100000==0:


cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

In [15]:
G=nx.Graph() # so that we dont destroy G if we start running this cell


for index, row in df.iterrows():
    if '@' in row[5]:
        for t in re.split('[^a-zA-Z\_\@]', row[5]):
            if t!='' and t[0]=='@':

In [16]:
DegList = list(
DegDic = {}
for D in DegList:
    if D in DegDic:
        DegDic[D] += 1
        DegDic[D] = 1

In [19]:
plt.ylabel('Log Count')
plt.title('Log plot of Degree Distribution of Graph')

In [20]:
DegList = list(
for D in DegList:
    if D[1]>3000:

('mileycyrus', 3450)

In [21]:
plt.title('Log-Log of Degree Distribution of Graph')
plt.ylabel('Log Count')
plt.xlabel('Log Degree')
DegList = sorted(DegDic.items())
Xlist, Ylist = zip(*DegList)
plt.loglog(Xlist,Ylist, basex=np.e, basey=np.e)
del Xlist
del Ylist

In [27]:
del DegDic
del DegList

In [9]:
LargestCC = max(nx.connected_component_subgraphs(G), key=len) # largest connected component


In [120]:
scaler = MinMaxScaler((50,800))

CoreCounts = []
for i in range(2,10):
    core_i = nx.k_core(LargestCC, i)
del core_i

In [37]:
#WOWWWWWWWW worth including

[<matplotlib.lines.Line2D at 0x7f8727360208>]

In [10]:
core7 = nx.k_core(LargestCC,7)

In [11]:
Bcent = np.array(list(nx.betweenness_centrality(core7,normalized = True).values()))
scaledBC = scaler.fit_transform(Bcent)

/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/ DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/ DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)

In [12]:
Ecent = np.array(list(nx.eigenvector_centrality_numpy(core7).values()))
scaledEC = scaler.fit_transform(Ecent[:,np.newaxis])

Ccent = np.array(list(nx.closeness_centrality(core7).values()))
scaledCC = scaler.fit_transform(Ccent[:,np.newaxis])

In [17]:
L1_dist = [L1(scaledCC,scaledBC),L1(scaledEC,scaledBC),L1(scaledEC,scaledCC)]
print("""From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures""")
D = L1_dist[0]
print("The L1 distance between Closeness centality and Betweenness Centrality is %d implying average distance of %f"%\
      (D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[1]
print("The L1 distance between Eigenvalue centality and Betweenness Centrality is %d implying average distance of %f"%\
      (D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[2]
print("The L1 distance between Closeness centality and Eigenvalue Centrality is %d implying average distance of %f"%\
      (D, D*1.0/nx.number_of_nodes(core7)))

From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures
The L1 distance between Closeness centality and Betweenness Centrality is 386829 implying average distance of 391.527373
The L1 distance between Eigenvalue centality and Betweenness Centrality is 45735 implying average distance of 46.290495
The L1 distance between Closeness centality and Eigenvalue Centrality is 371185 implying average distance of 375.693507

In [11]:
f = nx.fiedler_vector(core7)
s = np.zeros(len(f),dtype='int')

In [12]:
colors = ['#d7191c', '#2b83ba']
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)

In [55]:
L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eig(L)
v = np.array(v)
worder = np.argsort(w)
#pos = {i: np.array([f[0], f[1]]) for i, f in enumerate(zip(v[:,worder[1]], v[:,worder[2]]))}

In [56]:
X = v @ np.diag(w)
X = X[:,worder]

In [57]:
error = np.zeros(9)
for k in range(2,11):
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    error[k-2] = kmeans.inertia_

In [58]:

[<matplotlib.lines.Line2D at 0x7f61c24a39b0>]

In [68]:
kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

In [70]:
colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)

/home/ubuntu/anaconda3/lib/python3.6/site-packages/networkx/drawing/ MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()
/home/ubuntu/anaconda3/lib/python3.6/site-packages/networkx/drawing/ MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/ UserWarning: axes.hold is deprecated. Please remove it from your matplotlibrc and/or style files.
  warnings.warn(self.msg_depr_set % key)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/ UserWarning: axes.hold is deprecated, will be removed in 3.0
  warnings.warn("axes.hold is deprecated, will be removed in 3.0")

In [23]:
import sklearn
print sklearn.__version__


In [18]:
from sklearn import mixture


In [67]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=8, max_df=0.8)
dtm = vectorizer.fit_transform(TextList)

del TextList
terms = vectorizer.get_feature_names()

In [5]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

In [25]:
prefix = './trainingandtestdata/'

testfile = prefix + 'testdata.manual.2009.06.14.csv'
trainfile = prefix + 'training.1600000.processed.noemoticon.csv'

In [27]:
df = pd.read_csv(trainfile,names=cols,encoding='latin-1')#names=m_cols ,

In [28]:


In [29]:

polarity tweetID date Query UserID text
0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot - Awww, t...
1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....

In [41]:
df_small = df.iloc[:600000]

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=8,max_df=0.8)
M = vectorizer.fit_transform(df.text)

In [48]:
print type(M)

<class 'scipy.sparse.csr.csr_matrix'>
<1600000x52521 sparse matrix of type '<type 'numpy.float64'>'
	with 10140434 stored elements in Compressed Sparse Row format>

In [55]:
#doesnt work well

# from scipy import io

# with open('M.mtx','w') as fout:
#     io.mmwrite(fout, M)#, comment='', field=None, precision=None, symmetry=None)[source]

# io.mmwrite('M', M)

In [53]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=10, random_state=42)
X = svd.fit_transform(M)

# Clustering with some parameters.

from sklearn.cluster import KMeans
ncl = 8
k = 30
kmeans = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y = kmeans.fit_predict(X[:,:k])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_

for tweets in df[:1000]

In [59]:
df_ = df.copy()

In [131]:
df_['class'] = y_30

In [138]:

33136 rows × 7 columns

In [133]:

In [130]:
classes = {}
for user in df_['UserID'].unique():
    vals = {}
    for i in df_[df_['UserID']==user]['class']:
        if i!=1:
            if i in vals:

        classes[user] = max(vals, key=stats.get)
    except ValueError:
        classes[user] = 1

import statistics
from statistics import StatisticsError
import random
import math

# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon','black']
user_class = []
for g in core7:
            Id_Pred = df_[df_['UserID']==user]['class']
            X = statistics.mode(ID_Pred[g])
        except StatisticsError:
    except KeyError:

In [112]:
# df_[df_['UserID']=='ForzaRagazza']
vals = {}
for i in df_[df_['UserID']=='usagiko']['class']:
    if i!=1:
        if i in vals:

max(vals, key=stats.get)


In [100]:
stats = {'a':2,'b':4,'c':1}
max(stats, key=stats.get)


In [97]:

1600000 rows × 7 columns

In [95]:
dfff = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3],'B': [5, 6, 5, 5, 5, 7, 6]})
# df.mode()

In [73]:
# df.columns#

# Clustering with some parameters.

from sklearn.cluster import KMeans
ncl = 8
k = 30
kmeans_30 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y_30 = kmeans_30.fit_predict(X[:,:30])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_

# # GG=nx.Graph() # so that we dont destroy G if we start running this cell

# m=0
# n=0

# for index, row in df_.iterrows():
#     G.add_node(row[4])
#     if '@' in row[5]:
#         m+=1
#         for t in re.split('[^a-zA-Z\_\@]', row[5]):
#             if t!='' and t[0]=='@':
#                 G.add_edge(row[4],t[1:])
#                 n+=1

In [127]:
colors = ['b','w','r','g','c','m','y','k']
# g : green.
# r : red.
# c : cyan.
# m : magenta.
# y : yellow.
# k : black.
# w : white.

In [125]:
labeled_nodes = []
for x in core7.nodes():
    labeled_nodes.append( (x, 1) )
        labeled_nodes[i]=( (x,classes[x]) )
    except KeyError:

In [129]:
fig = plt.figure(figsize=(12,6))
ax = plt.subplot(111)

# labeled_nodes = [(x,classes[x]) for x in core7.nodes()]
# which = np.random.choice(range(len(labeled_nodes)),500)
which = range(len(core7))
# mini_g = core7.subgraph([labeled_nodes[i][0] for i in which])
mini_g = core7
node_colors = [colors[labeled_nodes[i][1]] for i in which]

nx.draw(mini_g, node_color=node_colors,node_size=100, ax=ax, with_labels='False',
       alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core,nodelist=Gc_core.nodes()[:100], node_color=node_colors,node_size=100, ax=ax, with_labels='False',
#        alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core, node_color=node_colors,node_size=10, ax=ax, with_labels='True', font_size=16)

In [110]:
for i in xrange(ncl):
    print list(y_30).count(i)


In [111]:
# #Kmeans

# # Clustering with some parameters.

# from sklearn.cluster import KMeans
# ncl = 20
# k = 10
# kmeans_10_100 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
# y_10_100 = kmeans_10_100.fit_predict(X[:,:10])
# # centroids = kmeans.cluster_centers_
# # labels = kmeans.labels_
# # error = kmeans.inertia_
# y_10_100

for i in xrange(100):
    if pred[i]==1:
        print TextList[i]