In [1]:
%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from networkx.algorithms.connectivity import minimum_st_edge_cut
from networkx.algorithms.flow import shortest_augmenting_path
from sklearn.cluster import KMeans
In [2]:
def L1(x, y):
    """Return the L1 (Manhattan) distance between two equal-length vectors.

    Parameters
    ----------
    x, y : sequences of numbers of equal length.

    Returns
    -------
    The sum of absolute element-wise differences, or None (after printing a
    message) if the lengths differ.
    """
    if len(x) != len(y):
        # Bug fix: the original returned the undefined name `Null`, which
        # raised NameError instead of signalling the error gracefully.
        print('vectors must be equal length for L1')
        return None
    # Sum of absolute element-wise differences.
    return sum(math.fabs(a - b) for a, b in zip(x, y))
In [3]:
# Preview the first few raw lines of the training CSV alongside their parse.
i = 0
with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        # Bug fix: csv.reader(line) iterates the *characters* of the string,
        # so each character was parsed as its own CSV record.  Wrapping the
        # line in a list makes the reader see one line and yield its fields.
        print(list(csv.reader([line], skipinitialspace=True)))
        print(line)
        i += 1
        if i > 3:
            break
In [14]:
"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,
In [4]:
# Build an undirected "mention" graph straight from the raw CSV text:
# nodes are users, edges connect an author to each @handle in their tweet.
G=nx.Graph()
m=0  # number of lines whose text field contains an '@'
n=0  # number of lines processed (progress counter)
# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        # csv.reader(line) iterates the line character-by-character.  Because
        # the fields are quoted, each quoted field is reassembled into one
        # record (with '\n' between the original characters), so indices 8 and
        # 10 land on the user and text fields respectively.
        # NOTE(review): this depends on the exact quoting of this file —
        # verify before reusing on any other input.
        lineX = list(csv.reader(line, skipinitialspace=True))
        G.add_node(lineX[8][0])
        if '@' in lineX[10][0]:
            m+=1
        # Split the text on anything that is not a letter, '_' or '@'.
        for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
            if t!='' and t[0]=='@' and t!='@':
                # Edge from the author to the mentioned handle (without '@').
                G.add_edge(lineX[8][0],t[1:])
        n+=1
        if n%100000==0:
            print(n)  # progress indicator every 100k lines
print(nx.number_of_nodes(G))
In [6]:
print(nx.number_of_edges(G))
In [ ]:
In [7]:
"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,
In [15]:
# Rebuild the mention graph from the parsed DataFrame (cleaner than the raw
# character-level CSV hack used earlier).
G = nx.Graph()  # fresh graph so that re-running this cell doesn't accumulate state
m = 0  # tweets whose text contains an '@'
n = 0  # rows processed
for index, row in df.iterrows():
    # Fix: label-based access instead of positional row[4]/row[5] —
    # integer positional indexing of a Series is deprecated and removed in
    # pandas 2.x; the columns are named 'UserID' and 'text'.
    user = row['UserID']
    text = row['text']
    G.add_node(user)
    if '@' in text:
        m += 1
        # Raw string avoids invalid-escape-sequence warnings; \_ and \@ in the
        # original were just _ and @.
        for t in re.split(r'[^a-zA-Z_@]', text):
            if t != '' and t[0] == '@':
                # Edge from author to mentioned handle (drop the '@').
                G.add_edge(user, t[1:])
    n += 1
In [16]:
len(G)
Out[16]:
In [8]:
float(nx.number_of_edges(G))/float(nx.number_of_nodes(G))
Out[8]:
In [ ]:
In [16]:
# Build DegDic: degree value -> number of nodes with that degree.
# Fix: wrap in dict(...) — on networkx 2.x, nx.degree(G) returns a DegreeView
# which has no .values(); dict(...) works on both 1.x and 2.x.
DegList = list(dict(nx.degree(G)).values())
DegDic = {}
for D in DegList:
    # Count occurrences of each degree value.
    DegDic[D] = DegDic.get(D, 0) + 1
In [19]:
# Degree histogram with a log-scaled count axis — the degree distribution of
# a mention graph is heavy-tailed, so a linear y axis would hide the tail.
plt.yscale('log')
plt.ylabel('Log Count')
plt.title('Log plot of Degree Distribution of Graph')
plt.xscale('linear')
plt.xlabel('Degree')
plt.hist(DegList,bins=100)
Out[19]:
In [20]:
# Print the hub accounts: nodes with degree above 3000.
# Fix: wrap in dict(...) — on networkx 2.x, nx.degree(G) returns a DegreeView
# which has no .items(); dict(...) works on both 1.x and 2.x.
DegList = list(dict(nx.degree(G)).items())
for D in DegList:
    if D[1] > 3000:
        print(D)  # (node, degree)
In [21]:
# Log-log degree distribution: an approximately straight line would suggest
# a power-law (scale-free) degree distribution.
plt.title('Log-Log of Degree Distribution of Graph')
plt.ylabel('Log Count')
plt.xlabel('Log Degree')
DegList = sorted(DegDic.items())  # (degree, count) pairs ordered by degree
Xlist, Ylist = zip(*DegList)
# NOTE(review): basex/basey were renamed to `base` in matplotlib 3.3+ — this
# call will TypeError on modern matplotlib; update when upgrading.
plt.loglog(Xlist,Ylist, basex=np.e, basey=np.e)
# Free the (potentially large) coordinate tuples.
del Xlist
del Ylist
In [ ]:
In [27]:
del DegDic
del DegList
In [9]:
# Largest connected component of the mention graph.
# Fix: nx.connected_component_subgraphs() was removed in networkx 2.4.
# max over connected_components + G.subgraph works on both 1.x and 2.x;
# .copy() yields a mutable graph (2.x subgraphs are frozen views), which the
# later self-loop-removal cell requires.
LargestCC = G.subgraph(max(nx.connected_components(G), key=len)).copy()
print(nx.number_of_nodes(LargestCC))
In [120]:
# Drop self-loops (k-core / centrality analysis below assumes none).
# Fix: Graph.selfloop_edges() was removed in networkx 2.x; the module-level
# nx.selfloop_edges() is the current API.  list(...) avoids mutating the
# graph while iterating over the edge view.
LargestCC.remove_edges_from(list(nx.selfloop_edges(LargestCC)))
# Scaler used to map centrality scores into a common [50, 800] range.
scaler = MinMaxScaler((50,800))
In [ ]:
# How fast does the largest component shrink as we demand a higher minimum
# degree?  Record the size of the k-core for k = 2..9.
CoreCounts = []
for k_val in range(2, 10):
    CoreCounts.append(nx.number_of_nodes(nx.k_core(LargestCC, k_val)))
In [ ]:
plt.yscale('log')
plt.plot(range(2,10),CoreCounts)
In [37]:
#WOWWWWWWWW worth including
Out[37]:
In [10]:
core7 = nx.k_core(LargestCC,7)
In [11]:
# Betweenness centrality of every node in the 7-core, scaled into [50, 800].
# Fix: MinMaxScaler.fit_transform requires a 2-D array; the original passed
# the 1-D vector, which raises ValueError on modern scikit-learn.  The
# [:, np.newaxis] reshape matches the eigenvector/closeness cells below.
Bcent = np.array(list(nx.betweenness_centrality(core7, normalized=True).values()))
scaledBC = scaler.fit_transform(Bcent[:, np.newaxis])
In [12]:
# Eigenvector centrality (numpy implementation) scaled into the same
# [50, 800] range; [:, np.newaxis] reshapes the 1-D vector to the (n, 1)
# column MinMaxScaler expects.
Ecent = np.array(list(nx.eigenvector_centrality_numpy(core7).values()))
scaledEC = scaler.fit_transform(Ecent[:,np.newaxis])
In [ ]:
# Closeness centrality scaled into the same [50, 800] range, reshaped to the
# (n, 1) column MinMaxScaler expects.
Ccent = np.array(list(nx.closeness_centrality(core7).values()))
scaledCC = scaler.fit_transform(Ccent[:,np.newaxis])
In [17]:
# Compare the three scaled centrality vectors by their pairwise L1 distance.
# NOTE(review): %d truncates the float total; %f shows the per-node average.
L1_dist = [L1(scaledCC,scaledBC),L1(scaledEC,scaledBC),L1(scaledEC,scaledCC)]
print("""From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures""")
D = L1_dist[0]
print("The L1 distance between Closeness centality and Betweenness Centrality is %d implying average distance of %f"%\
(D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[1]
print("The L1 distance between Eigenvalue centality and Betweenness Centrality is %d implying average distance of %f"%\
(D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[2]
print("The L1 distance between Closeness centality and Eigenvalue Centrality is %d implying average distance of %f"%\
(D, D*1.0/nx.number_of_nodes(core7)))
In [18]:
plt.yscale('log')
plt.hist(scaledBC)
Out[18]:
In [19]:
plt.yscale('log')
plt.hist(scaledEC)
Out[19]:
In [21]:
plt.yscale('log')
plt.hist(scaledCC)
Out[21]:
In [11]:
# Spectral bisection: the sign of each entry of the Fiedler vector (the
# eigenvector of the second-smallest Laplacian eigenvalue) splits the 7-core
# into two communities; s holds 0/1 labels.
f = nx.fiedler_vector(core7)
s = np.where(f > 0, 1, 0)
In [12]:
colors = ['#d7191c', '#2b83ba']
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)
In [55]:
# Full eigendecomposition of the graph Laplacian of the 7-core.
# Fix: the Laplacian is symmetric, so use eigh — np.linalg.eig can return
# spurious complex components from floating-point asymmetry; eigh guarantees
# real eigenvalues and is faster on symmetric input.
L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eigh(L)
v = np.array(v)
# eigh already returns eigenvalues in ascending order; worder is kept so the
# downstream cells that index by it keep working unchanged.
worder = np.argsort(w)
#pos = {i: np.array([f[0], f[1]]) for i, f in enumerate(zip(v[:,worder[1]], v[:,worder[2]]))}
#pos = {i: np.array([f[0], f[1]]) for i, f in enumerate(zip(v[:,worder[1]], v[:,worder[2]]))}
In [56]:
# Spectral embedding: scale each eigenvector by its eigenvalue and order the
# columns by increasing eigenvalue — column 0 is the trivial (constant)
# eigenvector, so the clustering cells below use columns 1..k.
X = v @ np.diag(w)
X = X[:,worder]
In [57]:
# Elbow-method data: k-means inertia on the 2-D spectral embedding
# (columns 1 and 2) for k = 2..10.
error = np.zeros(9)
for idx, k in enumerate(range(2, 11)):
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit_predict(X[:, 1:3])
    error[idx] = kmeans.inertia_
In [58]:
plt.plot(range(2,11),error)
Out[58]:
In [68]:
# Final clustering: 6 clusters (chosen from the elbow plot above) on the
# 2-D spectral embedding.
kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
kmeans.fit_predict(X[:,1:3])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_   # cluster id per node, in core7 node order
error = kmeans.inertia_   # final within-cluster sum of squares
In [70]:
# Draw the 7-core with one colour per k-means cluster label.
colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)
In [23]:
import sklearn
# Fix: Python 2 print statement -> print() function (SyntaxError on Python 3).
print(sklearn.__version__)
In [18]:
# Fix: a bare `import` with no module name is a SyntaxError — removed.
from sklearn import mixture
mixture.GaussianMixture()
In [67]:
# TF-IDF document-term matrix of the tweet texts.
# NOTE(review): TfidfVectorizer is imported and TextList is defined in other
# cells — this cell only runs after them; confirm execution order on a fresh
# kernel.
vectorizer = TfidfVectorizer(stop_words='english', min_df=8, max_df=0.8)
dtm = vectorizer.fit_transform(TextList)
del TextList  # free memory; the raw text list is no longer needed
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 — use
# get_feature_names_out() on modern versions.
terms = vectorizer.get_feature_names()
print("Finished")
In [5]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,
In [25]:
prefix = './trainingandtestdata/'
testfile = prefix + 'testdata.manual.2009.06.14.csv'
trainfile = prefix + 'training.1600000.processed.noemoticon.csv'
In [27]:
df = pd.read_csv(trainfile,names=cols,encoding='latin-1')#names=m_cols ,
In [28]:
len(df)
Out[28]:
In [29]:
df.head()
Out[29]:
In [41]:
df_small = df.iloc[:600000]
In [31]:
len(df_small)
Out[31]:
In [44]:
#long
#long
# Long-running cell: TF-IDF over all 1.6M tweet texts.  Keeps terms that
# appear in at least 8 tweets and at most 80% of them, dropping English
# stop words.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=8,max_df=0.8)
M = vectorizer.fit_transform(df.text)
In [48]:
# Fix: Python 2 print statement -> print() function (SyntaxError on Python 3).
print(type(M))
M  # sparse TF-IDF matrix; last expression shows its repr (shape, nnz)
Out[48]:
In [55]:
#doesnt work well
# from scipy import io
# with open('M.mtx','w') as fout:
# io.mmwrite(fout, M)#, comment='', field=None, precision=None, symmetry=None)[source]
# io.mmwrite('M', M)
In [53]:
# LSA: reduce the sparse TF-IDF matrix to 50 dense components.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=10, random_state=42)
X = svd.fit_transform(M)  # (n_tweets, 50) dense embedding
In [139]:
svd.components_
Out[139]:
In [56]:
X.shape
Out[56]:
In [57]:
#Kmeans
# Clustering with some parameters.
from sklearn.cluster import KMeans
ncl = 8   # number of clusters
k = 30    # number of SVD components to cluster on
kmeans = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y = kmeans.fit_predict(X[:,:k])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_
y  # cluster label per tweet; last expression displays the array
Out[57]:
In [ ]:
for tweets in df[:1000]
In [59]:
df_ = df.copy()
In [131]:
df_['class'] = y_30
In [138]:
df_[df_['class']==4]
Out[138]:
In [133]:
df_[df_['class']==5]
Out[133]:
In [ ]:
In [103]:
df_[df_['UserID']=='usagiko']
Out[103]:
In [130]:
# Majority class per user: classes[user] is the k-means class that the user's
# tweets land in most often, ignoring class 1 (treated as the noise class).
classes = {}
for user in df_['UserID'].unique():
    vals = {}
    for i in df_[df_['UserID']==user]['class']:
        if i != 1:
            vals[i] = vals.get(i, 0) + 1
    try:
        # Bug fix: the original used key=stats.get, where `stats` is an
        # unrelated demo dict defined in another cell — every lookup returned
        # None, breaking the max.  vals.get picks the most frequent class.
        classes[user] = max(vals, key=vals.get)
    except ValueError:
        # All of this user's tweets were class 1 -> vals is empty.
        classes[user] = 1
In [ ]:
import statistics
from statistics import StatisticsError
import random
import math
# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon','black']
# Colour each 7-core node by the modal k-means class of that user's tweets.
user_class = []
for g in core7:
    try:
        try:
            # Bug fixes: filter by the current node `g` (the original reused a
            # stale `user` variable from a previous cell) and use a single
            # consistent name (the original assigned Id_Pred but then read
            # ID_Pred, raising NameError).
            id_pred = list(df_[df_['UserID'] == g]['class'])
            node_colors.append(colors[statistics.mode(id_pred)])
        except StatisticsError:
            # No unique mode: pick one of the user's classes at random.
            node_colors.append(colors[id_pred[random.randint(0, len(id_pred) - 1)]])
    except KeyError:
        # User not present in the tweet frame -> fallback colour.
        node_colors.append(colors[6])
In [112]:
# Worked example of the per-user majority-class computation for one user.
# df_[df_['UserID']=='ForzaRagazza']
vals = {}
for i in df_[df_['UserID']=='usagiko']['class']:
    if i != 1:  # ignore the "noise" class
        vals[i] = vals.get(i, 0) + 1
# Bug fix: was key=stats.get — `stats` is an unrelated demo dict whose .get
# returns None for every class id; vals.get returns the most frequent class.
max(vals, key=vals.get)
Out[112]:
In [100]:
# Sanity check: max with a value-lookup key returns the key with the largest
# value ('b' here).
stats = {'a': 2, 'b': 4, 'c': 1}
max(stats.keys(), key=lambda key: stats[key])
Out[100]:
In [97]:
df_
Out[97]:
In [95]:
dfff = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3],'B': [5, 6, 5, 5, 5, 7, 6]})
# df.mode()
In [73]:
# df.columns#
df[u'UserID'].unique()
Out[73]:
In [66]:
#Kmeans
# Clustering with some parameters.
# Same clustering as before but kept under distinct names (kmeans_30 / y_30)
# so the earlier fit is not clobbered; uses the first 30 SVD components.
from sklearn.cluster import KMeans
ncl = 8   # number of clusters
k = 30    # number of SVD components to cluster on
kmeans_30 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y_30 = kmeans_30.fit_predict(X[:,:30])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_
y_30  # cluster label per tweet; last expression displays the array
Out[66]:
In [ ]:
# # GG=nx.Graph() # so that we dont destroy G if we start running this cell
# m=0
# n=0
# for index, row in df_.iterrows():
# G.add_node(row[4])
# if '@' in row[5]:
# m+=1
# for t in re.split('[^a-zA-Z\_\@]', row[5]):
# if t!='' and t[0]=='@':
# G.add_edge(row[4],t[1:])
# n+=1
In [127]:
colors = ['b','w','r','g','c','m','y','k']
# g : green.
# r : red.
# c : cyan.
# m : magenta.
# y : yellow.
# k : black.
# w : white.
In [125]:
# Pair each 7-core node with its predicted class; users that never appear in
# `classes` default to class 1.
labeled_nodes = [(node, classes.get(node, 1)) for node in core7.nodes()]
In [ ]:
labeled
In [129]:
# Draw the 7-core coloured by each node's predicted class.
fig = plt.figure(figsize=(12,6))
ax = plt.subplot(111)
# labeled_nodes = [(x,classes[x]) for x in core7.nodes()]
# which = np.random.choice(range(len(labeled_nodes)),500)
which = range(len(core7))
# mini_g = core7.subgraph([labeled_nodes[i][0] for i in which])
mini_g = core7
# One colour per node, looked up from its (node, class) pair.
node_colors = [labeled_nodes[i][1] for i in which]
node_colors = [colors[c] for c in node_colors]
# Fix: with_labels was the *string* 'False', which is truthy, so labels were
# being drawn (invisibly, at font_size=0).  Pass the boolean instead.
nx.draw(mini_g, node_color=node_colors, node_size=100, ax=ax, with_labels=False,
        alpha=0.2, font_size=0, width=0.1)
# nx.draw(Gc_core,nodelist=Gc_core.nodes()[:100], node_color=node_colors,node_size=100, ax=ax, with_labels='False',
#         alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core, node_color=node_colors,node_size=10, ax=ax, with_labels='True', font_size=16)
In [74]:
type(G.nodes()[0])
Out[74]:
In [79]:
G.nodes()[1]
Out[79]:
In [107]:
list(y_30).count(0)
Out[107]:
In [108]:
list(y_30).count(1)
Out[108]:
In [110]:
# Size of each of the ncl clusters.
# Fix: xrange and the print statement are Python 2 only.
for i in range(ncl):
    print(list(y_30).count(i))
In [111]:
# #Kmeans
# # Clustering with some parameters.
# from sklearn.cluster import KMeans
# ncl = 20
# k = 10
# kmeans_10_100 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
# y_10_100 = kmeans_10_100.fit_predict(X[:,:10])
# # centroids = kmeans.cluster_centers_
# # labels = kmeans.labels_
# # error = kmeans.inertia_
# y_10_100
In [ ]:
# Show the tweets among the first 100 that were predicted into cluster 1.
# Fix: xrange and the print statement are Python 2 only.
for i in range(100):
    if pred[i] == 1:
        print(TextList[i])