``````

In [1]:

%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from networkx.algorithms.connectivity import minimum_st_edge_cut
from networkx.algorithms.flow import shortest_augmenting_path
from sklearn.cluster import KMeans

``````
``````

In [2]:

def L1 (x,y):
dist = 0
if len(x)==len(y):
for i in range(len(x)):
dist += math.fabs(x[i]-y[i])
return(dist)
else:
print('vectors must be equal length for L1')
return (Null)

``````
``````

In [3]:

i = 0

with open('training.1600000.processed.noemoticon.csv') as f_in:
for line in f_in:
print (list(csv.reader(line, skipinitialspace=True)))
print (line)
i+=1
if i>3:
break

``````
``````

[['0'], ['', ''], ['1467810369'], ['', ''], ['Mon Apr 06 22:19:45 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['_TheSpecialOne_'], ['', ''], ["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"], []]
"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","_TheSpecialOne_","@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

[['0'], ['', ''], ['1467810672'], ['', ''], ['Mon Apr 06 22:19:49 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['scotthamilton'], ['', ''], ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"], []]
"0","1467810672","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","scotthamilton","is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"

[['0'], ['', ''], ['1467810917'], ['', ''], ['Mon Apr 06 22:19:53 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['mattycus'], ['', ''], ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'], []]
"0","1467810917","Mon Apr 06 22:19:53 PDT 2009","NO_QUERY","mattycus","@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds"

[['0'], ['', ''], ['1467811184'], ['', ''], ['Mon Apr 06 22:19:57 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['ElleCTF'], ['', ''], ['my whole body feels itchy and like its on fire '], []]
"0","1467811184","Mon Apr 06 22:19:57 PDT 2009","NO_QUERY","ElleCTF","my whole body feels itchy and like its on fire "

``````
``````

In [14]:

"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

``````
``````

In [4]:

G=nx.Graph()
m=0
n=0

# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
for line in f_in:
lineX = list(csv.reader(line, skipinitialspace=True))
G.add_node(lineX[8][0])
if '@' in lineX[10][0]:
m+=1
for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
if t!='' and t[0]=='@' and t!='@':
G.add_edge(lineX[8][0],t[1:])
n+=1
if n%100000==0:
print(n)
print(nx.number_of_nodes(G))

``````
``````

100000
100000
100000
100000
100000
100000
100000
200000
200000
300000
300000
400000
500000
600000
700000
889334

``````
``````

In [6]:

print(nx.number_of_edges(G))

``````
``````

616462

``````
``````

In [ ]:

``````
``````

In [7]:

"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

``````
``````

In [15]:

G=nx.Graph() # so that we dont destroy G if we start running this cell

m=0
n=0

for index, row in df.iterrows():
G.add_node(row[4])
if '@' in row[5]:
m+=1
for t in re.split('[^a-zA-Z\_\@]', row[5]):
if t!='' and t[0]=='@':
G.add_edge(row[4],t[1:])
n+=1

``````
``````

In [16]:

len(G)

``````
``````

Out[16]:

889335

``````
``````

In [8]:

float(nx.number_of_edges(G))/float(nx.number_of_nodes(G))

``````
``````

Out[8]:

0.6931726437986179

``````
``````

In [ ]:

``````
``````

In [16]:

DegList = list(nx.degree(G).values())
DegDic = {}
for D in DegList:
if D in DegDic:
DegDic[D] += 1
else:
DegDic[D] = 1

``````
``````

In [19]:

plt.yscale('log')
plt.ylabel('Log Count')
plt.title('Log plot of Degree Distribution of Graph')
plt.xscale('linear')
plt.xlabel('Degree')
plt.hist(DegList,bins=100)

``````
``````

Out[19]:

(array([  8.88118000e+05,   8.90000000e+02,   1.70000000e+02,
5.80000000e+01,   2.70000000e+01,   1.40000000e+01,
9.00000000e+00,   5.00000000e+00,   1.30000000e+01,
4.00000000e+00,   6.00000000e+00,   2.00000000e+00,
2.00000000e+00,   0.00000000e+00,   0.00000000e+00,
2.00000000e+00,   0.00000000e+00,   2.00000000e+00,
1.00000000e+00,   0.00000000e+00,   2.00000000e+00,
0.00000000e+00,   0.00000000e+00,   2.00000000e+00,
1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
1.00000000e+00]),
array([    0. ,    34.5,    69. ,   103.5,   138. ,   172.5,   207. ,
241.5,   276. ,   310.5,   345. ,   379.5,   414. ,   448.5,
483. ,   517.5,   552. ,   586.5,   621. ,   655.5,   690. ,
724.5,   759. ,   793.5,   828. ,   862.5,   897. ,   931.5,
966. ,  1000.5,  1035. ,  1069.5,  1104. ,  1138.5,  1173. ,
1207.5,  1242. ,  1276.5,  1311. ,  1345.5,  1380. ,  1414.5,
1449. ,  1483.5,  1518. ,  1552.5,  1587. ,  1621.5,  1656. ,
1690.5,  1725. ,  1759.5,  1794. ,  1828.5,  1863. ,  1897.5,
1932. ,  1966.5,  2001. ,  2035.5,  2070. ,  2104.5,  2139. ,
2173.5,  2208. ,  2242.5,  2277. ,  2311.5,  2346. ,  2380.5,
2415. ,  2449.5,  2484. ,  2518.5,  2553. ,  2587.5,  2622. ,
2656.5,  2691. ,  2725.5,  2760. ,  2794.5,  2829. ,  2863.5,
2898. ,  2932.5,  2967. ,  3001.5,  3036. ,  3070.5,  3105. ,
3139.5,  3174. ,  3208.5,  3243. ,  3277.5,  3312. ,  3346.5,
3381. ,  3415.5,  3450. ]),
<a list of 100 Patch objects>)

``````
``````

In [20]:

DegList = list(nx.degree(G).items())
for D in DegList:
if D[1]>3000:
print(D)

``````
``````

('mileycyrus', 3450)

``````
``````

In [21]:

plt.title('Log-Log of Degree Distribution of Graph')
plt.ylabel('Log Count')
plt.xlabel('Log Degree')
DegList = sorted(DegDic.items())
Xlist, Ylist = zip(*DegList)
plt.loglog(Xlist,Ylist, basex=np.e, basey=np.e)
del Xlist
del Ylist

``````
``````

``````
``````

In [ ]:

``````
``````

In [27]:

del DegDic
del DegList

``````
``````

In [9]:

LargestCC = max(nx.connected_component_subgraphs(G), key=len) # largest connected component
print(nx.number_of_nodes(LargestCC))

``````
``````

339766

``````
``````

In [120]:

LargestCC.remove_edges_from(LargestCC.selfloop_edges())
scaler = MinMaxScaler((50,800))

``````
``````

In [ ]:

CoreCounts = []
for i in range(2,10):
core_i = nx.k_core(LargestCC, i)
CoreCounts.append(nx.number_of_nodes(core_i))
del core_i

``````
``````

In [ ]:

plt.yscale('log')
plt.plot(range(2,10),CoreCounts)

``````
``````

In [37]:

#WOWWWWWWWW worth including

``````
``````

Out[37]:

[<matplotlib.lines.Line2D at 0x7f8727360208>]

``````
``````

In [10]:

core7 = nx.k_core(LargestCC,7)

``````
``````

---------------------------------------------------------------------------
NetworkXError                             Traceback (most recent call last)
<ipython-input-10-fb53b157c762> in <module>()
----> 1 core7 = nx.k_core(LargestCC,7)

/Users/km/anaconda/lib/python2.7/site-packages/networkx/algorithms/core.pyc in k_core(G, k, core_number)
151     """
152     if core_number is None:
--> 153         core_number=nx.core_number(G)
154     if k is None:
155         k=max(core_number.values()) # max core

/Users/km/anaconda/lib/python2.7/site-packages/networkx/algorithms/core.pyc in core_number(G)
68         raise nx.NetworkXError(
69                 'Input graph has self loops; the core number is not defined.',
---> 70                 'Consider using G.remove_edges_from(G.selfloop_edges()).')
71
72     if G.is_directed():

NetworkXError: ('Input graph has self loops; the core number is not defined.', 'Consider using G.remove_edges_from(G.selfloop_edges()).')

``````
``````

In [11]:

Bcent = np.array(list(nx.betweenness_centrality(core7,normalized = True).values()))
scaledBC = scaler.fit_transform(Bcent)

``````
``````

/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)

``````
``````

In [12]:

Ecent = np.array(list(nx.eigenvector_centrality_numpy(core7).values()))
scaledEC = scaler.fit_transform(Ecent[:,np.newaxis])

``````
``````

In [ ]:

Ccent = np.array(list(nx.closeness_centrality(core7).values()))
scaledCC = scaler.fit_transform(Ccent[:,np.newaxis])

``````
``````

In [17]:

L1_dist = [L1(scaledCC,scaledBC),L1(scaledEC,scaledBC),L1(scaledEC,scaledCC)]
print("""From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures""")
D = L1_dist[0]
print("The L1 distance between Closeness centality and Betweenness Centrality is %d implying average distance of %f"%\
(D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[1]
print("The L1 distance between Eigenvalue centality and Betweenness Centrality is %d implying average distance of %f"%\
(D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[2]
print("The L1 distance between Closeness centality and Eigenvalue Centrality is %d implying average distance of %f"%\
(D, D*1.0/nx.number_of_nodes(core7)))

``````
``````

From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures
The L1 distance between Closeness centality and Betweenness Centrality is 386829 implying average distance of 391.527373
The L1 distance between Eigenvalue centality and Betweenness Centrality is 45735 implying average distance of 46.290495
The L1 distance between Closeness centality and Eigenvalue Centrality is 371185 implying average distance of 375.693507

``````
``````

In [18]:

plt.yscale('log')
plt.hist(scaledBC)

``````
``````

Out[18]:

(array([ 929.,   19.,   23.,    5.,    6.,    2.,    2.,    0.,    1.,    1.]),
array([  50.,  125.,  200.,  275.,  350.,  425.,  500.,  575.,  650.,
725.,  800.]),
<a list of 10 Patch objects>)

``````
``````

In [19]:

plt.yscale('log')
plt.hist(scaledEC)

``````
``````

Out[19]:

(array([ 780.,  146.,   48.,    7.,    0.,    2.,    0.,    2.,    1.,    2.]),
array([  50.,  125.,  200.,  275.,  350.,  425.,  500.,  575.,  650.,
725.,  800.]),
<a list of 10 Patch objects>)

``````
``````

In [21]:

plt.yscale('log')
plt.hist(scaledCC)

``````
``````

Out[21]:

(array([  11.,   29.,   31.,   57.,  175.,  252.,  332.,   71.,   21.,    9.]),
array([  50.,  125.,  200.,  275.,  350.,  425.,  500.,  575.,  650.,
725.,  800.]),
<a list of 10 Patch objects>)

``````
``````

In [11]:

f = nx.fiedler_vector(core7)
s = np.zeros(len(f),dtype='int')
s[f>0]=1

``````
``````

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-a697dcd2cdf7> in <module>()
----> 1 f = nx.fiedler_vector(core7)
2 s = np.zeros(len(f),dtype='int')
3 s[f>0]=1

NameError: name 'core7' is not defined

``````
``````

In [12]:

colors = ['#d7191c', '#2b83ba']
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)

``````
``````

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-495c7a8b012f> in <module>()
1 colors = ['#d7191c', '#2b83ba']
----> 2 node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
3 nx.draw(core7, node_color=node_colors,node_size=10)

NameError: name 'core7' is not defined

``````
``````

In [55]:

L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eig(L)
v = np.array(v)
worder = np.argsort(w)
#pos = {i: np.array([f[0], f[1]]) for i, f in enumerate(zip(v[:,worder[1]], v[:,worder[2]]))}

``````
``````

In [56]:

X = v @ np.diag(w)
X = X[:,worder]

``````
``````

In [57]:

error = np.zeros(9)
for k in range(2,11):
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans.fit_predict(X[:,1:3])
error[k-2] = kmeans.inertia_

``````
``````

In [58]:

plt.plot(range(2,11),error)

``````
``````

Out[58]:

[<matplotlib.lines.Line2D at 0x7f61c24a39b0>]

``````
``````

In [68]:

kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
kmeans.fit_predict(X[:,1:3])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

``````
``````

In [70]:

colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)

``````
``````

/home/ubuntu/anaconda3/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:126: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
Future behavior will be consistent with the long-time default:
plot commands add elements without first clearing the
Axes and/or Figure.
b = plt.ishold()
/home/ubuntu/anaconda3/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:138: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
Future behavior will be consistent with the long-time default:
plot commands add elements without first clearing the
Axes and/or Figure.
plt.hold(b)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py:917: UserWarning: axes.hold is deprecated. Please remove it from your matplotlibrc and/or style files.
warnings.warn(self.msg_depr_set % key)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/rcsetup.py:152: UserWarning: axes.hold is deprecated, will be removed in 3.0
warnings.warn("axes.hold is deprecated, will be removed in 3.0")

``````
``````

In [23]:

import sklearn
print sklearn.__version__

``````
``````

0.17.1

``````
``````

In [18]:

import
from sklearn import mixture

mixture.GaussianMixture()

``````
``````

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-a1a84fb1c0cf> in <module>()
2 from sklearn import mixture
3
----> 4 mixture.GaussianMixture()

AttributeError: 'module' object has no attribute 'GaussianMixture'

``````
``````

In [67]:

vectorizer = TfidfVectorizer(stop_words='english', min_df=8, max_df=0.8)
dtm = vectorizer.fit_transform(TextList)

del TextList
terms = vectorizer.get_feature_names()
print("Finished")

``````
``````

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-67-b0bc613432c5> in <module>()
----> 1 vectorizer = TfidfVectorizer(stop_words='english', min_df=8, max_df=0.8)
2 dtm = vectorizer.fit_transform(TextList)
3
4 del TextList
5 terms = vectorizer.get_feature_names()

NameError: name 'TfidfVectorizer' is not defined

``````
``````

In [5]:

df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

``````
``````

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-7153293e4f30> in <module>()
----> 1 df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

NameError: name 'cols' is not defined

``````
``````

In [25]:

prefix = './trainingandtestdata/'

testfile = prefix + 'testdata.manual.2009.06.14.csv'
trainfile = prefix + 'training.1600000.processed.noemoticon.csv'

``````
``````

In [27]:

df = pd.read_csv(trainfile,names=cols,encoding='latin-1')#names=m_cols ,

``````
``````

In [28]:

len(df)

``````
``````

Out[28]:

1600000

``````
``````

In [29]:

df.head()

``````
``````

Out[29]:

polarity
tweetID
date
Query
UserID
text

0
0
1467810369
Mon Apr 06 22:19:45 PDT 2009
NO_QUERY
_TheSpecialOne_
@switchfoot http://twitpic.com/2y1zl - Awww, t...

1
0
1467810672
Mon Apr 06 22:19:49 PDT 2009
NO_QUERY
scotthamilton
is upset that he can't update his Facebook by ...

2
0
1467810917
Mon Apr 06 22:19:53 PDT 2009
NO_QUERY
mattycus
@Kenichan I dived many times for the ball. Man...

3
0
1467811184
Mon Apr 06 22:19:57 PDT 2009
NO_QUERY
ElleCTF
my whole body feels itchy and like its on fire

4
0
1467811193
Mon Apr 06 22:19:57 PDT 2009
NO_QUERY
Karoli
@nationwideclass no, it's not behaving at all....

``````
``````

In [41]:

df_small = df.iloc[:600000]

``````
``````

In [31]:

len(df_small)

``````
``````

Out[31]:

1000

``````
``````

In [44]:

#long
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=8,max_df=0.8)
M = vectorizer.fit_transform(df.text)

``````
``````

In [48]:

print type(M)
M

``````
``````

<class 'scipy.sparse.csr.csr_matrix'>

Out[48]:

<1600000x52521 sparse matrix of type '<type 'numpy.float64'>'
with 10140434 stored elements in Compressed Sparse Row format>

``````
``````

In [55]:

#doesnt work well

# from scipy import io

# with open('M.mtx','w') as fout:
#     io.mmwrite(fout, M)#, comment='', field=None, precision=None, symmetry=None)[source]

# io.mmwrite('M', M)

``````
``````

In [53]:

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=10, random_state=42)
X = svd.fit_transform(M)

``````
``````

In [139]:

svd.components_

``````
``````

Out[139]:

array([[  2.72001580e-03,   1.69149426e-03,   1.09716343e-04, ...,
2.72950482e-05,   3.23040176e-06,   5.18766711e-06],
[ -4.29279707e-04,   1.22860704e-03,   5.74032116e-05, ...,
3.49286375e-05,   7.78247908e-06,   1.07673030e-05],
[ -6.20510551e-04,  -2.94362336e-04,  -4.71793942e-05, ...,
3.79566391e-06,  -8.31496087e-08,   3.20352253e-06],
...,
[ -1.17122879e-04,  -6.05301438e-05,   1.80451782e-05, ...,
-1.61457500e-05,   8.85458117e-07,  -1.28697769e-06],
[ -5.88878369e-04,  -3.37554068e-04,   7.58171236e-05, ...,
1.73194680e-06,   4.35612975e-07,  -2.37956566e-06],
[ -8.27196817e-04,  -2.17366492e-04,  -2.30040004e-05, ...,
-3.65960547e-05,   1.45690214e-06,  -1.99006957e-06]])

``````
``````

In [56]:

X.shape

``````
``````

Out[56]:

(1600000, 50)

``````
``````

In [57]:

#Kmeans

# Clustering with some parameters.

from sklearn.cluster import KMeans
ncl = 8
k = 30
kmeans = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y = kmeans.fit_predict(X[:,:k])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_
y

``````
``````

Out[57]:

array([6, 7, 0, ..., 0, 7, 0], dtype=int32)

``````
``````

In [ ]:

for tweets in df[:1000]

``````
``````

In [59]:

df_ = df.copy()

``````
``````

In [131]:

df_['class'] = y_30

``````
``````

In [138]:

df_[df_['class']==4]

``````
``````

Out[138]:

polarity
tweetID
date
Query
UserID
text
class

103
0
1467837470
Mon Apr 06 22:26:43 PDT 2009
NO_QUERY
annette414
watching &quot;House&quot;
4

119
0
1467839586
Mon Apr 06 22:27:18 PDT 2009
NO_QUERY
sonyolmos
@eRRe_sC aaw i miss ya all too.. im leaving to...
4

124
0
1467840552
Mon Apr 06 22:27:34 PDT 2009
NO_QUERY
weefranniev
Late night snack, glass of OJ b/c I'm &quot;do...
4

183
0
1467858363
Mon Apr 06 22:32:12 PDT 2009
NO_QUERY
schammy
Downloading NIN's new album &quot;the slip&quo...
4

206
0
1467862710
Mon Apr 06 22:33:20 PDT 2009
NO_QUERY
Jemimus
My mind and body are severely protesting this ...
4

213
0
1467863684
Mon Apr 06 22:33:35 PDT 2009
NO_QUERY
DjGundam
Awwh babs... you look so sad underneith that s...
4

286
0
1467881920
Mon Apr 06 22:38:28 PDT 2009
NO_QUERY
mumu1210
FML: So much for seniority, bc of technologica...
4

319
0
1467894593
Mon Apr 06 22:41:52 PDT 2009
NO_QUERY
Tanja71
@JonathanRKnight Oh! Did I mention it? &quot;G...
4

393
0
1467911624
Mon Apr 06 22:46:32 PDT 2009
NO_QUERY
Mati_UOIT
Sitting here wondering why &quot;ED&quot; stil...
4

400
0
1467913111
Mon Apr 06 22:46:57 PDT 2009
NO_QUERY
nssmom
#3 woke up and was having an accident - &quot;...
4

473
0
1467931070
Mon Apr 06 22:52:06 PDT 2009
NO_QUERY
calee01
&quot;On popular music&quot; by T.W.Adorno is ...
4

492
0
1467934004
Mon Apr 06 22:52:56 PDT 2009
NO_QUERY
malice_sin
pears &amp; Brie, bottle of Cabernet, and &quo...
4

510
0
1467943007
Mon Apr 06 22:55:30 PDT 2009
NO_QUERY
vibratoria
@stuiy never again will I click on a link that...
4

569
0
1467952985
Mon Apr 06 22:58:24 PDT 2009
NO_QUERY
mickeyness
@daniela_95616 hahaa!! i just realized &quot;i...
4

592
0
1467962336
Mon Apr 06 23:00:55 PDT 2009
NO_QUERY
umfoo
my heart is broken every morning dropping Foo ...
4

725
0
1467992696
Mon Apr 06 23:09:35 PDT 2009
NO_QUERY
pop_corn_
I feel like I am the only &quot;twitterer&quot...
4

742
0
1467996096
Mon Apr 06 23:10:33 PDT 2009
NO_QUERY
Beatmixology
@djsoulsister yeah, great vid. I had the 12&qu...
4

747
0
1467998037
Mon Apr 06 23:11:06 PDT 2009
NO_QUERY
amitgupta
Have an invite for &quot;Healthy Dining&quot; ...
4

817
0
1468013020
Mon Apr 06 23:15:30 PDT 2009
NO_QUERY
melvinchia
has a mild left inner ear infection.. and its ...
4

879
0
1468031882
Mon Apr 06 23:21:15 PDT 2009
NO_QUERY
NesbyPhips
So Im done editing &quot;The Phipstape&quot;. ...
4

905
0
1468035396
Mon Apr 06 23:22:24 PDT 2009
NO_QUERY
noslennai
Watching &quot;a league of their own&quot;...m...
4

979
0
1468052239
Mon Apr 06 23:27:43 PDT 2009
NO_QUERY
ConstanceClark
@MarcusMims wow i didn't get an &quot;hello&qu...
4

987
0
1468053198
Mon Apr 06 23:28:02 PDT 2009
NO_QUERY
grinc
Reading Buyology before bedtime... great premi...
4

990
0
1468053804
Mon Apr 06 23:28:13 PDT 2009
NO_QUERY
princeralph
omg... &quot;The Reader&quot; is making me
4

1044
0
1468068303
Mon Apr 06 23:32:33 PDT 2009
NO_QUERY
rob_fitzpatrick
@gracedent it's her &quot;hair&quot; I can't d...
4

1048
0
1468068726
Mon Apr 06 23:32:42 PDT 2009
NO_QUERY
verruca
Why isn't there a &quot;fake&quot; Verruca on ...
4

1071
0
1468074597
Mon Apr 06 23:34:32 PDT 2009
NO_QUERY
PeterHC
Used the term &quot;Fail Whale&quot; to a clie...
4

1086
0
1468076942
Mon Apr 06 23:35:15 PDT 2009
NO_QUERY
geekpondering
@transbay &quot;SFMTA Budget Proposal Hearing:...
4

1104
0
1468080524
Mon Apr 06 23:36:22 PDT 2009
NO_QUERY
Msjackson88
still no &quot;followers&quot;   please some1 ...
4

1204
0
1468107293
Mon Apr 06 23:45:12 PDT 2009
NO_QUERY
norawatkins
VIP guests today -________-&quot;   blohheeee ...
4

...
...
...
...
...
...
...
...

1598559
4
2193190901
Tue Jun 16 08:07:11 PDT 2009
NO_QUERY
MsSophieBelle
@KempEquine nothing wrong  Mom has one for me(...
4

1598567
4
2193220591
Tue Jun 16 08:09:39 PDT 2009
NO_QUERY
katyhelena
@o0omunkieo0o Thanks! I think so too. I call h...
4

1598655
4
2193224813
Tue Jun 16 08:10:00 PDT 2009
NO_QUERY
JanayS
&quot;i wake up it's a bad dream no one on my ...
4

1598668
4
2193251846
Tue Jun 16 08:12:11 PDT 2009
NO_QUERY
RunWithForest
@explosivityy I'm writing now.  Di ko na sinas...
4

1598718
4
2193254120
Tue Jun 16 08:12:22 PDT 2009
NO_QUERY
christicox
I asked a 3yr old how old I was &amp; he said,...
4

1598749
4
2193255731
Tue Jun 16 08:12:31 PDT 2009
NO_QUERY
jointuletz
am plecat la &quot;la comedie&quot;...v-astept...
4

1598770
4
2193277168
Tue Jun 16 08:14:18 PDT 2009
NO_QUERY
peachtweet
@JKL_Katie omg, i know. it's annoying! they're...
4

1598938
4
2193306490
Tue Jun 16 08:16:41 PDT 2009
NO_QUERY
Emm_Jay
@Twyst  That would be great... &quot;looking f...
4

1598969
4
2193319050
Tue Jun 16 08:17:43 PDT 2009
NO_QUERY
howardsublett
@dougalcorn: It is actually just @daveminor wa...
4

1599053
4
2193343042
Tue Jun 16 08:19:44 PDT 2009
NO_QUERY
CardboxDiva
@thejoshlynn You are! BTW send me an email. Wa...
4

1599188
4
2193371978
Tue Jun 16 08:22:04 PDT 2009
NO_QUERY
cunderwo
@meganlm i rented &quot;dead silence&quot; at ...
4

1599264
4
2193402232
Tue Jun 16 08:24:36 PDT 2009
NO_QUERY
krystlerb
@TANGG GT was a good movie...although I spent ...
4

1599337
4
2193426650
Tue Jun 16 08:26:37 PDT 2009
NO_QUERY
LucasSchmitt
@maryzlane that sucks. :/ i guess you have to ...
4

1599410
4
2193428898
Tue Jun 16 08:26:48 PDT 2009
NO_QUERY
caileighamazing
&quot;Wow, What A Tight Fit&quot; Lmao, Shutup.
4

1599418
4
2193429153
Tue Jun 16 08:26:49 PDT 2009
NO_QUERY
erikalanzer19
@brendanlover11 I really love &quot;Starlight&...
4

1599424
4
2193450427
Tue Jun 16 08:28:32 PDT 2009
NO_QUERY
HelloGracey
Exploring the world of Twitter   Listening to ...
4

1599460
4
2193452390
Tue Jun 16 08:28:42 PDT 2009
NO_QUERY
comradephil
@aw16 I must have skipped the &quot;pun&quot; ...
4

1599478
4
2193453100
Tue Jun 16 08:28:45 PDT 2009
NO_QUERY
jgunzz
Why don't we name tomorrow &quot;the Official ...
4

1599557
4
2193475600
Tue Jun 16 08:30:33 PDT 2009
NO_QUERY
johnbertr
@chinkchilla there's this brilliant add-on for...
4

1599612
4
2193478589
Tue Jun 16 08:30:47 PDT 2009
NO_QUERY
teotarafas
@InesVargas 17&quot;?! Hope you dont plan on t...
4

1599615
4
2193478782
Tue Jun 16 08:30:48 PDT 2009
NO_QUERY
MilanTeh
@brothrsaw The count would lead to shutter dea...
4

1599669
4
2193503005
Tue Jun 16 08:32:45 PDT 2009
NO_QUERY
jtsosnowski
You know there's way too much going on when yo...
4

1599682
4
2193503503
Tue Jun 16 08:32:48 PDT 2009
NO_QUERY
oscarbarber
@perequintana ara sÃ­ que ets tot un &quot;pir...
4

1599702
4
2193504328
Tue Jun 16 08:32:52 PDT 2009
NO_QUERY
SweetTartelette
@ang_w It's been forever since I have had &quo...
4

1599802
4
2193529779
Tue Jun 16 08:34:56 PDT 2009
NO_QUERY
maeubhminor
woooo elliot minor in 8 days!!! and i got my E...
4

1599835
4
2193551788
Tue Jun 16 08:36:44 PDT 2009
NO_QUERY
annamadeleine
@alexandervelky that's polite version - i only...
4

1599840
4
2193552024
Tue Jun 16 08:36:44 PDT 2009
NO_QUERY
bdottie
What a pretty day  &quot;Just smile&quot;
4

1599946
4
2193577228
Tue Jun 16 08:38:49 PDT 2009
NO_QUERY
Sirley
@chriscuzzy someone wanted a &quot;Cuzzy&quot;...
4

1599977
4
2193578386
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
TeamUKskyvixen
@MayorDorisWolfe Thats my girl - dishing out t...
4

1599985
4
2193578982
Tue Jun 16 08:38:58 PDT 2009
NO_QUERY
LISKFEST
if ur the lead singer in a band, beware fallin...
4

33136 rows × 7 columns

``````
``````

In [133]:

df_[df_['class']==5]

``````
``````

Out[133]:

polarity
tweetID
date
Query
UserID
text
class

99
0
1467836859
Mon Apr 06 22:26:33 PDT 2009
NO_QUERY
willy_chaz
A bad nite for the favorite teams: Astros and ...
5

122
0
1467840016
Mon Apr 06 22:27:25 PDT 2009
NO_QUERY
BustaBusta
I know my life has been flipped upside down wh...
5

160
0
1467853356
Mon Apr 06 22:30:54 PDT 2009
NO_QUERY
dbmendel
Picked Mich St to win it all from the get go. ...
5

178
0
1467857297
Mon Apr 06 22:31:56 PDT 2009
NO_QUERY
amanda5280
Today I realized I am too good at hiding thing...
5

192
0
1467859820
Mon Apr 06 22:32:36 PDT 2009
NO_QUERY
msbutt3rfly14
spencer is not a good guy.
5

219
0
1467871040
Mon Apr 06 22:35:31 PDT 2009
NO_QUERY
MTLarson1224
@DonnieWahlberg I hope i can make it to the au...
5

222
0
1467871545
Mon Apr 06 22:35:40 PDT 2009
NO_QUERY
Cherye101
@PaulaAbdul awww, Good luck Paula!! Please don...
5

290
0
1467882902
Mon Apr 06 22:38:44 PDT 2009
NO_QUERY
usagiko
@LevenRambin: Take it easy, and be good to you.
5

298
0
1467889791
Mon Apr 06 22:40:33 PDT 2009
NO_QUERY
jennhelvering
Just called Hillsong again - they said they co...
5

320
0
1467894600
Mon Apr 06 22:41:51 PDT 2009
NO_QUERY
dreaaa
throat is closing up and i had some string che...
5

348
0
1467899025
Mon Apr 06 22:43:06 PDT 2009
NO_QUERY
oup
still sick. feeling a bit better, got some new...
5

441
0
1467924690
Mon Apr 06 22:50:17 PDT 2009
NO_QUERY
FlyRice
Good GOD they ruined my belly button!!!
5

444
0
1467925657
Mon Apr 06 22:50:34 PDT 2009
NO_QUERY
aisyahsamsudin
running nose + spinning head = not a good comb...
5

470
0
1467930341
Mon Apr 06 22:51:53 PDT 2009
NO_QUERY
GemDoughnut
MORNING!!! Good im bloody knackered!!! Work is...
5

475
0
1467931501
Mon Apr 06 22:52:13 PDT 2009
NO_QUERY
soulonfire68
We've been good. I'm not liking the snow right...
5

508
0
1467937402
Mon Apr 06 22:53:55 PDT 2009
NO_QUERY
haunter_
@Houndour ...i wish i was there...i'm pretty g...
5

532
0
1467947005
Mon Apr 06 22:56:40 PDT 2009
NO_QUERY
tamisara
Good morning! Ready 2 go, but I want 2 go back...
5

567
0
1467952123
Mon Apr 06 22:58:08 PDT 2009
NO_QUERY
TurkishDelite
@JonathanRKnight Good Knight hun! Looking forw...
5

603
0
1467964229
Mon Apr 06 23:01:26 PDT 2009
NO_QUERY
farty_brando
@greggrunberg hey you said matt was gonna go a...
5

653
0
1467979491
Mon Apr 06 23:05:45 PDT 2009
NO_QUERY
TeresaUlring
@PaulColes  hmmm...greed is good when it motiv...
5

689
0
1467985114
Mon Apr 06 23:07:22 PDT 2009
NO_QUERY
gladyschock
not feeling v good abt myself
5

745
0
1467997236
Mon Apr 06 23:10:55 PDT 2009
NO_QUERY
alyssaisrad916
UpdatingFFE.  That gives me nothing to do for ...
5

746
0
1467997817
Mon Apr 06 23:11:03 PDT 2009
NO_QUERY
bellalucia
Hot compress not rily workng for pulled muscle...
5

782
0
1468004641
Mon Apr 06 23:12:59 PDT 2009
NO_QUERY
kow_shik
@vivekg86 Good to hear that we have support in...
5

791
0
1468006362
Mon Apr 06 23:13:30 PDT 2009
NO_QUERY
woflln
Ugh can't sleep.  Wish i had a good cuddle to ...
5

869
0
1468030831
Mon Apr 06 23:20:55 PDT 2009
NO_QUERY
StrAbZ
good morning everybody! pkoi y fais pas beau  ...
5

910
0
1468035840
Mon Apr 06 23:22:31 PDT 2009
NO_QUERY
Catmoo
@Sephystryx I've been looking about for good s...
5

1003
0
1468060375
Mon Apr 06 23:30:10 PDT 2009
NO_QUERY
BrandesAsh
'study group extraordinare' about to leave cam...
5

1026
0
1468063973
Mon Apr 06 23:31:15 PDT 2009
NO_QUERY
HilaryBays
@BenPritchett goodness me, how did you find me...
5

1028
0
1468064339
Mon Apr 06 23:31:18 PDT 2009
NO_QUERY
Jenoah1908
my poor little girl has a baaaad rash on her b...
5

...
...
...
...
...
...
...
...

1599486
4
2193453323
Tue Jun 16 08:28:46 PDT 2009
NO_QUERY
janiecwales
@fudgecrumpet Oh good, it will be good to get ...
5

1599511
4
2193454228
Tue Jun 16 08:28:51 PDT 2009
NO_QUERY
panicxx
Good cinema exam!!
5

1599527
4
2193474286
Tue Jun 16 08:30:27 PDT 2009
NO_QUERY
siobhandemers
Good Morning!!  I'm making cookies today
5

1599546
4
2193475013
Tue Jun 16 08:30:30 PDT 2009
NO_QUERY
nataliefyffe
I totally rocked drunken Rockband! Nothin' bet...
5

1599589
4
2193477152
Tue Jun 16 08:30:40 PDT 2009
NO_QUERY
jhollenbach
@MikeRuocco good luck
5

1599592
4
2193477292
Tue Jun 16 08:30:41 PDT 2009
NO_QUERY
kenyonknight
Hmm I dreamed that I woke up in good ol'New Ha...
5

1599609
4
2193478306
Tue Jun 16 08:30:46 PDT 2009
NO_QUERY
loyalwolf06
is SO excited for The Fray concert today!!!!!!...
5

1599614
4
2193478763
Tue Jun 16 08:30:48 PDT 2009
NO_QUERY
Nelinski_254
#mw2 Click on more from this user.. and rate m...
5

1599629
4
2193501365
Tue Jun 16 08:32:37 PDT 2009
NO_QUERY
pianogeek
Good morning everyone!  And to my fabulous UK ...
5

1599642
4
2193501998
Tue Jun 16 08:32:40 PDT 2009
NO_QUERY
PaulHarriott
@hype6477 Good to hear all's cool. I've just s...
5

1599656
4
2193502444
Tue Jun 16 08:32:42 PDT 2009
NO_QUERY
ireneagh
@pinkpeony_etsy you have a lovely shop on etsy...
5

1599660
4
2193502743
Tue Jun 16 08:32:44 PDT 2009
NO_QUERY
Ritter33
rain is good
5

1599690
4
2193503753
Tue Jun 16 08:32:49 PDT 2009
NO_QUERY
mommyperks
@Travelwthemagic Good morning, here. Still jus...
5

1599762
4
2193527876
Tue Jun 16 08:34:47 PDT 2009
NO_QUERY
MKeithHarris
@denniswords good times indeed
5

1599768
4
2193528075
Tue Jun 16 08:34:48 PDT 2009
NO_QUERY
rach3lizabeth
Good morning everyone.
5

1599782
4
2193528659
Tue Jun 16 08:34:51 PDT 2009
NO_QUERY
annaasaywhat
http://bln.kr/T1/ good song (Y)
5

1599821
4
2193551359
Tue Jun 16 08:36:41 PDT 2009
NO_QUERY
HildeM_EN
@Piewacket1 good   sometimes a shot attention ...
5

1599825
4
2193551473
Tue Jun 16 08:36:42 PDT 2009
NO_QUERY
james_bertram
@tonyhawk Yess! Good choice of laptop there, t...
5

1599827
4
2193551560
Tue Jun 16 08:36:42 PDT 2009
NO_QUERY
JadoreConcierge
Good morning Chicago, I love this city!
5

1599828
4
2193551571
Tue Jun 16 08:36:42 PDT 2009
NO_QUERY
crside
@dfinchalicious  It ends up like that when goo...
5

1599841
4
2193552033
Tue Jun 16 08:36:44 PDT 2009
NO_QUERY
josephranseth
Good morning everyone! I hope you take some ti...
5

1599850
4
2193552448
Tue Jun 16 08:36:46 PDT 2009
NO_QUERY
jenniehager
sick, sick, sick today, but still fingers cros...
5

1599856
4
2193552668
Tue Jun 16 08:36:48 PDT 2009
NO_QUERY
CintiaXimena
taking a twit break...a lil blown about my bos...
5

1599869
4
2193553340
Tue Jun 16 08:36:51 PDT 2009
NO_QUERY
nirvannah
@carahsollins Good Morning Lady
5

1599875
4
2193553559
Tue Jun 16 08:36:52 PDT 2009
NO_QUERY
MichiTheReal
@ashleytisdale hih good morning  in austria it...
5

1599907
4
2193575210
Tue Jun 16 08:38:39 PDT 2009
NO_QUERY
themunny
@gabespears morning
5

1599916
4
2193575737
Tue Jun 16 08:38:41 PDT 2009
NO_QUERY
MzGr33n4ppL3
Hello good morning!  i wanna do something today.
5

1599937
4
2193576655
Tue Jun 16 08:38:46 PDT 2009
NO_QUERY
eratyptin
@siahoney I am good thanks!    How is #Eric, I...
5

1599988
4
2193579191
Tue Jun 16 08:38:59 PDT 2009
NO_QUERY
tellman
@Roy_Everitt ha- good job. that's right - we g...
5

1599994
4
2193579489
Tue Jun 16 08:39:00 PDT 2009
NO_QUERY
EvolveTom
@Cliff_Forster Yeah, that does work better tha...
5

67484 rows × 7 columns

``````
``````

In [ ]:

``````
``````

In [103]:

df_[df_['UserID']=='usagiko']

``````
``````

Out[103]:

polarity
tweetID
date
Query
UserID
text
class

290
0
1467882902
Mon Apr 06 22:38:44 PDT 2009
NO_QUERY
usagiko
@LevenRambin: Take it easy, and be good to you.
5

121160
0
1833443461
Sun May 17 23:18:10 PDT 2009
NO_QUERY
usagiko
@danielthomsen: I hope you don't get the hamth...
7

122488
0
1833720820
Mon May 18 00:13:28 PDT 2009
NO_QUERY
usagiko
@rocketgirl13: I'd hug you but I'd be hugging ...
0

132187
0
1835698381
Mon May 18 06:33:52 PDT 2009
NO_QUERY
usagiko
Why am I awake?
0

188985
0
1969000232
Fri May 29 22:45:38 PDT 2009
NO_QUERY
usagiko
@aelysian: we're too popular for our own good,...
5

231465
0
1978970242
Sun May 31 01:07:24 PDT 2009
NO_QUERY
usagiko
@churunga: I still have your gifts to send. I ...
7

258601
0
1985229042
Sun May 31 16:41:30 PDT 2009
NO_QUERY
usagiko
@roanapur: oh honey.  I'm so sorry.
7

379545
0
2052364361
Sat Jun 06 00:29:06 PDT 2009
NO_QUERY
usagiko
@trifluorides: I wish I could use those. But b...
7

380562
0
2052591055
Sat Jun 06 01:16:00 PDT 2009
NO_QUERY
usagiko
@DarthRyu666: I got the raw, since I'm hardcor...
7

411700
0
2060181181
Sat Jun 06 18:25:00 PDT 2009
NO_QUERY
usagiko
@kiptripsyc: I want the new forme of pokemans.
4

415956
0
2061190664
Sat Jun 06 20:21:43 PDT 2009
NO_QUERY
usagiko
@imbrifer: YAYYYY! Also, it'll probably take s...
7

445539
0
2068152200
Sun Jun 07 13:35:19 PDT 2009
NO_QUERY
usagiko
So, depending on what the diagnosis is, we may...
0

446967
0
2068569121
Sun Jun 07 14:18:30 PDT 2009
NO_QUERY
usagiko
@kiddetective: oh honey.
0

529910
0
2195692030
Tue Jun 16 11:28:29 PDT 2009
NO_QUERY
usagiko
@hadleyk: In a &quot;Doctor Who&quot; kinda wa...
4

675244
0
2248034283
Fri Jun 19 20:00:09 PDT 2009
NO_QUERY
usagiko
@LevenRambin: They're making her do what? That...
7

676006
0
2248270018
Fri Jun 19 20:21:41 PDT 2009
NO_QUERY
usagiko
@sampo_ilmari http://twitpic.com/7vaq8 - You l...
1

688126
0
2251447717
Sat Jun 20 03:32:10 PDT 2009
NO_QUERY
usagiko
@finding_jay: I know. You had a very early sta...
7

688951
0
2251644443
Sat Jun 20 04:09:05 PDT 2009
NO_QUERY
usagiko
@finding_jay: My bb.  [gives you massage]
0

719533
0
2260622542
Sat Jun 20 19:30:53 PDT 2009
NO_QUERY
usagiko
Found aloe, so itchy.
0

799748
0
2329114201
Thu Jun 25 10:21:59 PDT 2009
NO_QUERY
usagiko
@finding_jay: I'm sorry I wasn't there, bb.  F...
0

938726
4
1793524597
Thu May 14 03:01:32 PDT 2009
NO_QUERY
usagiko
@LevenRambin: pics or it didn't happen, dude. ...
0

1155596
4
1979023887
Sun May 31 01:19:57 PDT 2009
NO_QUERY
usagiko
@churunga: Also, do you liek Spock?
0

1265419
4
1999438146
Mon Jun 01 20:13:04 PDT 2009
NO_QUERY
usagiko
@socalanon: are we getting an award?
0

``````
``````

In [130]:

classes = {}
for user in df_['UserID'].unique():
vals = {}
for i in df_[df_['UserID']==user]['class']:
if i!=1:
if i in vals:
vals[i]+=1
else:
vals[i]=1

try:
classes[user] = max(vals, key=stats.get)
except ValueError:
classes[user] = 1

``````
``````

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-130-d8b44f498784> in <module>()
2 for user in df_['UserID'].unique():
3     vals = {}
----> 4     for i in df_[df_['UserID']==user]['class']:
5         if i!=1:
6             if i in vals:

/Users/km/anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in wrapper(self, other, axis)
853
854             with np.errstate(all='ignore'):
--> 855                 res = na_op(values, other)
856             if isscalar(res):
857                 raise TypeError('Could not compare %s type with Series' %

/Users/km/anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in na_op(x, y)
757
758         if is_object_dtype(x.dtype):
--> 759             result = _comp_method_OBJECT_ARRAY(op, x, y)
760         else:
761

/Users/km/anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in _comp_method_OBJECT_ARRAY(op, x, y)
737         result = lib.vec_compare(x, y, op)
738     else:
--> 739         result = lib.scalar_compare(x, y, op)
740     return result
741

KeyboardInterrupt:

``````
``````

In [ ]:

import statistics
from statistics import StatisticsError
import random
import math

# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon','black']
user_class = []
for g in core7:
try:
try:
Id_Pred = df_[df_['UserID']==user]['class']
X = statistics.mode(ID_Pred[g])
node_colors.append(colors[X])
except StatisticsError:
node_colors.append(colors[ID_Pred[g][random.randint(0,len(ID_Pred[g])-1)]])
except KeyError:
node_colors.append(colors[6])

``````
``````

In [112]:

# df_[df_['UserID']=='ForzaRagazza']
vals = {}
for i in df_[df_['UserID']=='usagiko']['class']:
if i!=1:
if i in vals:
vals[i]+=1
else:
vals[i]=1

max(vals, key=stats.get)

``````
``````

Out[112]:

0

``````
``````

In [100]:

stats = {'a':2,'b':4,'c':1}
max(stats, key=stats.get)

``````
``````

Out[100]:

'b'

``````
``````

In [97]:

df_

``````
``````

Out[97]:

polarity
tweetID
date
Query
UserID
text
class

0
0
1467810369
Mon Apr 06 22:19:45 PDT 2009
NO_QUERY
_TheSpecialOne_
@switchfoot http://twitpic.com/2y1zl - Awww, t...
6

1
0
1467810672
Mon Apr 06 22:19:49 PDT 2009
NO_QUERY
scotthamilton
is upset that he can't update his Facebook by ...
7

2
0
1467810917
Mon Apr 06 22:19:53 PDT 2009
NO_QUERY
mattycus
@Kenichan I dived many times for the ball. Man...
0

3
0
1467811184
Mon Apr 06 22:19:57 PDT 2009
NO_QUERY
ElleCTF
my whole body feels itchy and like its on fire
7

4
0
1467811193
Mon Apr 06 22:19:57 PDT 2009
NO_QUERY
Karoli
@nationwideclass no, it's not behaving at all....
0

5
0
1467811372
Mon Apr 06 22:20:00 PDT 2009
NO_QUERY
joy_wolf
@Kwesidei not the whole crew
0

6
0
1467811592
Mon Apr 06 22:20:03 PDT 2009
NO_QUERY
mybirch
Need a hug
0

7
0
1467811594
Mon Apr 06 22:20:03 PDT 2009
NO_QUERY
coZZ
@LOLTrish hey  long time no see! Yes.. Rains a...
4

8
0
1467811795
Mon Apr 06 22:20:05 PDT 2009
NO_QUERY
2Hood4Hollywood
@Tatiana_K nope they didn't have it
0

9
0
1467812025
Mon Apr 06 22:20:09 PDT 2009
NO_QUERY
mimismo
@twittera que me muera ?
0

10
0
1467812416
Mon Apr 06 22:20:16 PDT 2009
NO_QUERY
erinx3leannexo
spring break in plain city... it's snowing
0

11
0
1467812579
Mon Apr 06 22:20:17 PDT 2009
NO_QUERY
pardonlauren
I just re-pierced my ears
7

12
0
1467812723
Mon Apr 06 22:20:19 PDT 2009
NO_QUERY
TLeC
@caregiving I couldn't bear to watch it.  And ...
0

13
0
1467812771
Mon Apr 06 22:20:19 PDT 2009
NO_QUERY
robrobbierobert
@octolinz16 It it counts, idk why I did either...
0

14
0
1467812784
Mon Apr 06 22:20:20 PDT 2009
NO_QUERY
bayofwolves
@smarrison i would've been the first, but i di...
7

15
0
1467812799
Mon Apr 06 22:20:20 PDT 2009
NO_QUERY
HairByJess
@iamjazzyfizzle I wish I got to watch it with ...
4

16
0
1467812964
Mon Apr 06 22:20:22 PDT 2009
NO_QUERY
lovesongwriter
Hollis' death scene will hurt me severely to w...
0

17
0
1467813137
Mon Apr 06 22:20:25 PDT 2009
NO_QUERY
armotley
about to file taxes
0

18
0
1467813579
Mon Apr 06 22:20:31 PDT 2009
NO_QUERY
starkissed
@LettyA ahh ive always wanted to see rent  lov...
0

19
0
1467813782
Mon Apr 06 22:20:34 PDT 2009
NO_QUERY
gi_gi_bee
@FakerPattyPattz Oh dear. Were you drinking ou...
0

20
0
1467813985
Mon Apr 06 22:20:37 PDT 2009
NO_QUERY
quanvu
@alydesigns i was out most of the day so didn'...
4

21
0
1467813992
Mon Apr 06 22:20:38 PDT 2009
NO_QUERY
swinspeedx
one of my friend called me, and asked to meet ...
7

22
0
1467814119
Mon Apr 06 22:20:40 PDT 2009
NO_QUERY
cooliodoc
@angry_barista I baked you a cake but I ated it
0

23
0
1467814180
Mon Apr 06 22:20:40 PDT 2009
NO_QUERY
viJILLante
this week is not going as i had hoped
7

24
0
1467814192
Mon Apr 06 22:20:41 PDT 2009
NO_QUERY
Ljelli3166
blagh class at 8 tomorrow
0

25
0
1467814438
Mon Apr 06 22:20:44 PDT 2009
NO_QUERY
ChicagoCubbie
I hate when I have to call and wake people up
7

26
0
1467814783
Mon Apr 06 22:20:50 PDT 2009
NO_QUERY
KatieAngell
Just going to cry myself to sleep after watchi...
4

27
0
1467814883
Mon Apr 06 22:20:52 PDT 2009
NO_QUERY
gagoo
im sad now  Miss.Lilly
7

28
0
1467815199
Mon Apr 06 22:20:56 PDT 2009
NO_QUERY
abel209
ooooh.... LOL  that leslie.... and ok I won't ...
0

29
0
1467815753
Mon Apr 06 22:21:04 PDT 2009
NO_QUERY
BaptisteTheFool
Meh... Almost Lover is the exception... this t...
0

...
...
...
...
...
...
...
...

1599970
4
2193578196
Tue Jun 16 08:38:54 PDT 2009
NO_QUERY
adbillingsley
Thanks @eastwestchic &amp; @wangyip Thanks! Th...
4

1599971
4
2193578237
Tue Jun 16 08:38:54 PDT 2009
NO_QUERY
gekkko
@marttn thanks Martin. not the most imaginativ...
0

1599972
4
2193578269
Tue Jun 16 08:38:54 PDT 2009
NO_QUERY
millerslab
@MikeJonesPhoto Congrats Mike  Way to go!
0

1599973
4
2193578319
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
luckygeorgeblog
http://twitpic.com/7jp4n - OMG! Office Space.....
1

1599974
4
2193578345
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
Kristah_Diggs
@yrclndstnlvr ahaha nooo you were just away fr...
7

1599975
4
2193578347
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
CoachChic
@BizCoachDeb  Hey, I'm baack! And, thanks so m...
0

1599976
4
2193578348
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
serianna
@mattycus Yeah, my conscience would be clear i...
0

1599977
4
2193578386
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
TeamUKskyvixen
@MayorDorisWolfe Thats my girl - dishing out t...
7

1599978
4
2193578395
Tue Jun 16 08:38:55 PDT 2009
NO_QUERY
LaurenMoo10
@shebbs123 i second that
0

1599979
4
2193578576
Tue Jun 16 08:38:57 PDT 2009
NO_QUERY
angel_sammy04
In the garden
0

1599980
4
2193578679
Tue Jun 16 08:38:56 PDT 2009
NO_QUERY
puchal_ek
@myheartandmind jo jen by nemuselo zrovna tÃ© ...
0

1599981
4
2193578716
Tue Jun 16 08:38:57 PDT 2009
NO_QUERY
youtubelatest
Another Commenting Contest! [;: Yay!!!  http:/...
1

1599982
4
2193578739
Tue Jun 16 08:38:57 PDT 2009
NO_QUERY
Mandi_Davenport
@thrillmesoon i figured out how to see my twee...
0

1599983
4
2193578758
Tue Jun 16 08:38:57 PDT 2009
NO_QUERY
xoAurixo
@oxhot theri tomorrow, drinking coffee, talkin...
7

1599984
4
2193578847
Tue Jun 16 08:38:57 PDT 2009
NO_QUERY
RobFoxKerr
You heard it here first -- We're having a girl...
0

1599985
4
2193578982
Tue Jun 16 08:38:58 PDT 2009
NO_QUERY
LISKFEST
if ur the lead singer in a band, beware fallin...
6

1599986
4
2193579087
Tue Jun 16 08:38:58 PDT 2009
NO_QUERY
marhgil
@tarayqueen too much ads on my blog.
0

1599987
4
2193579092
Tue Jun 16 08:38:58 PDT 2009
NO_QUERY
cathriiin
@La_r_a NEVEER  I think that you both will get...
7

1599988
4
2193579191
Tue Jun 16 08:38:59 PDT 2009
NO_QUERY
tellman
@Roy_Everitt ha- good job. that's right - we g...
5

1599989
4
2193579211
Tue Jun 16 08:38:59 PDT 2009
NO_QUERY
jazzstixx
@Ms_Hip_Hop im glad ur doing well
7

1599990
4
2193579249
Tue Jun 16 08:38:59 PDT 2009
NO_QUERY
razzberry5594
WOOOOO! Xbox is back
0

1599991
4
2193579284
Tue Jun 16 08:38:59 PDT 2009
NO_QUERY
AgustinaP
@rmedina @LaTati Mmmm  That sounds absolutely ...
7

1599992
4
2193579434
Tue Jun 16 08:39:00 PDT 2009
NO_QUERY
sdancingsteph
ReCoVeRiNg FrOm ThE lOnG wEeKeNd
0

1599993
4
2193579477
Tue Jun 16 08:39:00 PDT 2009
NO_QUERY
ChloeAmisha
@SCOOBY_GRITBOYS
0

1599994
4
2193579489
Tue Jun 16 08:39:00 PDT 2009
NO_QUERY
EvolveTom
@Cliff_Forster Yeah, that does work better tha...
4

1599995
4
2193601966
Tue Jun 16 08:40:49 PDT 2009
NO_QUERY
AmandaMarie1028
Just woke up. Having no school is the best fee...
4

1599996
4
2193601969
Tue Jun 16 08:40:49 PDT 2009
NO_QUERY
TheWDBoards
TheWDB.com - Very cool to hear old Walt interv...
6

1599997
4
2193601991
Tue Jun 16 08:40:49 PDT 2009
NO_QUERY
bpbabe
Are you ready for your MoJo Makeover? Ask me f...
0

1599998
4
2193602064
Tue Jun 16 08:40:49 PDT 2009
NO_QUERY
tinydiamondz
Happy 38th Birthday to my boo of alll time!!! ...
7

1599999
4
2193602129
Tue Jun 16 08:40:50 PDT 2009
NO_QUERY
RyanTrevMorris
happy #charitytuesday @theNSPCC @SparksCharity...
0

1600000 rows × 7 columns

``````
``````

In [95]:

dfff = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3],'B': [5, 6, 5, 5, 5, 7, 6]})
# df.mode()

``````
``````

In [73]:

# df.columns#
df[u'UserID'].unique()

``````
``````

Out[73]:

array([u'_TheSpecialOne_', u'scotthamilton', u'mattycus', ...,
u'EvolveTom', u'AmandaMarie1028', u'bpbabe'], dtype=object)

``````
``````

In [66]:

#Kmeans

# Clustering with some parameters.

from sklearn.cluster import KMeans
ncl = 8
k = 30
kmeans_30 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y_30 = kmeans_30.fit_predict(X[:,:30])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_
y_30

``````
``````

Out[66]:

array([0, 1, 1, ..., 1, 1, 1], dtype=int32)

``````
``````

In [ ]:

# # GG=nx.Graph() # so that we dont destroy G if we start running this cell

# m=0
# n=0

# for index, row in df_.iterrows():
#     G.add_node(row[4])
#     if '@' in row[5]:
#         m+=1
#         for t in re.split('[^a-zA-Z\_\@]', row[5]):
#             if t!='' and t[0]=='@':
#                 G.add_edge(row[4],t[1:])
#                 n+=1

``````
``````

In [127]:

colors = ['b','w','r','g','c','m','y','k']
# g : green.
# r : red.
# c : cyan.
# m : magenta.
# y : yellow.
# k : black.
# w : white.

``````
``````

In [125]:

i=0
labeled_nodes = []
for x in core7.nodes():
labeled_nodes.append( (x, 1) )
try:
labeled_nodes[i]=( (x,classes[x]) )
except KeyError:
pass
i+=1

``````
``````

In [ ]:

labeled

``````
``````

In [129]:

fig = plt.figure(figsize=(12,6))
ax = plt.subplot(111)

# labeled_nodes = [(x,classes[x]) for x in core7.nodes()]

# which = np.random.choice(range(len(labeled_nodes)),500)
which = range(len(core7))
# mini_g = core7.subgraph([labeled_nodes[i][0] for i in which])
mini_g = core7
node_colors = [colors[labeled_nodes[i][1]] for i in which]

nx.draw(mini_g, node_color=node_colors,node_size=100, ax=ax, with_labels='False',
alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core,nodelist=Gc_core.nodes()[:100], node_color=node_colors,node_size=100, ax=ax, with_labels='False',
#        alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core, node_color=node_colors,node_size=10, ax=ax, with_labels='True', font_size=16)

``````
``````

``````
``````

In [74]:

type(G.nodes()[0])

``````
``````

Out[74]:

str

``````
``````

In [79]:

G.nodes()[1]

``````
``````

Out[79]:

'elmoberry'

``````
``````

In [107]:

list(y_30).count(0)

``````
``````

Out[107]:

52429

``````
``````

In [108]:

list(y_30).count(1)

``````
``````

Out[108]:

1247302

``````
``````

In [110]:

for i in xrange(ncl):
print list(y_30).count(i)

``````
``````

52429
1247302
43656
32481
33136
67484
52836
70676

``````
``````

In [111]:

# #Kmeans

# # Clustering with some parameters.

# from sklearn.cluster import KMeans
# ncl = 20
# k = 10
# kmeans_10_100 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
# y_10_100 = kmeans_10_100.fit_predict(X[:,:10])
# # centroids = kmeans.cluster_centers_
# # labels = kmeans.labels_
# # error = kmeans.inertia_
# y_10_100

``````
``````

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-111-dc0405c5162d> in <module>()
7 k = 10
8 kmeans_10_100 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
----> 9 y_10_100 = kmeans_10_100.fit_predict(X[:,:10])
10 # centroids = kmeans.cluster_centers_
11 # labels = kmeans.labels_

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in fit_predict(self, X, y)
828         predict(X).
829         """
--> 830         return self.fit(X).labels_
831
832     def fit_transform(self, X, y=None):

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in fit(self, X, y)
819                 precompute_distances=self.precompute_distances,
820                 tol=self.tol, random_state=random_state, copy_x=self.copy_x,
--> 821                 n_jobs=self.n_jobs)
822         return self
823

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in k_means(X, n_clusters, init, precompute_distances, n_init, max_iter, verbose, tol, random_state, copy_x, n_jobs, return_n_iter)
322                 X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
323                 precompute_distances=precompute_distances, tol=tol,
--> 324                 x_squared_norms=x_squared_norms, random_state=random_state)
325             # determine if these results are the best so far
326             if best_inertia is None or inertia < best_inertia:

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _kmeans_single(X, n_clusters, x_squared_norms, max_iter, init, verbose, random_state, tol, precompute_distances)
445             _labels_inertia(X, x_squared_norms, centers,
446                             precompute_distances=precompute_distances,
--> 447                             distances=distances)
448
449         # computation of the means is also called the M-step of EM

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _labels_inertia(X, x_squared_norms, centers, precompute_distances, distances)
576                                                     centers, distances)
577         inertia = _k_means._assign_labels_array(
--> 578             X, x_squared_norms, centers, labels, distances=distances)
579     return labels, inertia
580

KeyboardInterrupt:

``````
``````

In [ ]:

for i in xrange(100):
if pred[i]==1:
print TextList[i]

``````