Test copy



In [1]:
%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from networkx.algorithms.connectivity import minimum_st_edge_cut
from networkx.algorithms.flow import shortest_augmenting_path
from sklearn.cluster import KMeans

In [2]:
def L1 (x,y):
    dist = 0
    if len(x)==len(y):
        for i in range(len(x)):
            dist += math.fabs(x[i]-y[i])
        return(dist)
    else:
        print('vectors must be equal length for L1')
        return (Null)

In [3]:
i = 0

with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        print (list(csv.reader(line, skipinitialspace=True)))
        print (line)
        i+=1
        if i>3:
            break


[['0'], ['', ''], ['1467810369'], ['', ''], ['Mon Apr 06 22:19:45 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['_TheSpecialOne_'], ['', ''], ["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"], []]
"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","_TheSpecialOne_","@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

[['0'], ['', ''], ['1467810672'], ['', ''], ['Mon Apr 06 22:19:49 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['scotthamilton'], ['', ''], ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"], []]
"0","1467810672","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","scotthamilton","is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"

[['0'], ['', ''], ['1467810917'], ['', ''], ['Mon Apr 06 22:19:53 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['mattycus'], ['', ''], ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'], []]
"0","1467810917","Mon Apr 06 22:19:53 PDT 2009","NO_QUERY","mattycus","@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds"

[['0'], ['', ''], ['1467811184'], ['', ''], ['Mon Apr 06 22:19:57 PDT 2009'], ['', ''], ['NO_QUERY'], ['', ''], ['ElleCTF'], ['', ''], ['my whole body feels itchy and like its on fire '], []]
"0","1467811184","Mon Apr 06 22:19:57 PDT 2009","NO_QUERY","ElleCTF","my whole body feels itchy and like its on fire "


In [14]:
"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

In [4]:
G=nx.Graph()
m=0
n=0

# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        lineX = list(csv.reader(line, skipinitialspace=True))
        G.add_node(lineX[8][0])
        if '@' in lineX[10][0]:
            m+=1
            for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
                if t!='' and t[0]=='@' and t!='@':
                    G.add_edge(lineX[8][0],t[1:])
                    n+=1
        if n%100000==0:
            print(n)
print(nx.number_of_nodes(G))


100000
100000
100000
100000
100000
100000
100000
200000
200000
300000
300000
400000
500000
600000
700000
889334

In [6]:
print(nx.number_of_edges(G))


616462

In [ ]:


In [7]:
"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

In [15]:
G=nx.Graph() # so that we dont destroy G if we start running this cell

m=0
n=0

for index, row in df.iterrows():
    G.add_node(row[4])
    if '@' in row[5]:
        m+=1
        for t in re.split('[^a-zA-Z\_\@]', row[5]):
            if t!='' and t[0]=='@':
                G.add_edge(row[4],t[1:])
                n+=1

In [16]:
len(G)


Out[16]:
889335

In [8]:
float(nx.number_of_edges(G))/float(nx.number_of_nodes(G))


Out[8]:
0.6931726437986179

In [ ]:


In [16]:
DegList = list(nx.degree(G).values())
DegDic = {}
for D in DegList:
    if D in DegDic:
        DegDic[D] += 1
    else:
        DegDic[D] = 1

In [19]:
plt.yscale('log')
plt.ylabel('Log Count')
plt.title('Log plot of Degree Distribution of Graph')
plt.xscale('linear')
plt.xlabel('Degree')
plt.hist(DegList,bins=100)


Out[19]:
(array([  8.88118000e+05,   8.90000000e+02,   1.70000000e+02,
          5.80000000e+01,   2.70000000e+01,   1.40000000e+01,
          9.00000000e+00,   5.00000000e+00,   1.30000000e+01,
          4.00000000e+00,   6.00000000e+00,   2.00000000e+00,
          2.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.00000000e+00,   0.00000000e+00,   2.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00]),
 array([    0. ,    34.5,    69. ,   103.5,   138. ,   172.5,   207. ,
          241.5,   276. ,   310.5,   345. ,   379.5,   414. ,   448.5,
          483. ,   517.5,   552. ,   586.5,   621. ,   655.5,   690. ,
          724.5,   759. ,   793.5,   828. ,   862.5,   897. ,   931.5,
          966. ,  1000.5,  1035. ,  1069.5,  1104. ,  1138.5,  1173. ,
         1207.5,  1242. ,  1276.5,  1311. ,  1345.5,  1380. ,  1414.5,
         1449. ,  1483.5,  1518. ,  1552.5,  1587. ,  1621.5,  1656. ,
         1690.5,  1725. ,  1759.5,  1794. ,  1828.5,  1863. ,  1897.5,
         1932. ,  1966.5,  2001. ,  2035.5,  2070. ,  2104.5,  2139. ,
         2173.5,  2208. ,  2242.5,  2277. ,  2311.5,  2346. ,  2380.5,
         2415. ,  2449.5,  2484. ,  2518.5,  2553. ,  2587.5,  2622. ,
         2656.5,  2691. ,  2725.5,  2760. ,  2794.5,  2829. ,  2863.5,
         2898. ,  2932.5,  2967. ,  3001.5,  3036. ,  3070.5,  3105. ,
         3139.5,  3174. ,  3208.5,  3243. ,  3277.5,  3312. ,  3346.5,
         3381. ,  3415.5,  3450. ]),
 <a list of 100 Patch objects>)

In [20]:
DegList = list(nx.degree(G).items())
for D in DegList:
    if D[1]>3000:
        print(D)


('mileycyrus', 3450)

In [21]:
plt.title('Log-Log of Degree Distribution of Graph')
plt.ylabel('Log Count')
plt.xlabel('Log Degree')
DegList = sorted(DegDic.items())
Xlist, Ylist = zip(*DegList)
plt.loglog(Xlist,Ylist, basex=np.e, basey=np.e)
del Xlist
del Ylist



In [ ]:


In [27]:
del DegDic
del DegList

In [9]:
LargestCC = max(nx.connected_component_subgraphs(G), key=len) # largest connected component
print(nx.number_of_nodes(LargestCC))


339766

In [120]:
LargestCC.remove_edges_from(LargestCC.selfloop_edges())
scaler = MinMaxScaler((50,800))

In [ ]:
CoreCounts = []
for i in range(2,10):
    core_i = nx.k_core(LargestCC, i)
    CoreCounts.append(nx.number_of_nodes(core_i))
del core_i

In [ ]:
plt.yscale('log')
plt.plot(range(2,10),CoreCounts)

In [37]:
#WOWWWWWWWW worth including


Out[37]:
[<matplotlib.lines.Line2D at 0x7f8727360208>]

In [10]:
core7 = nx.k_core(LargestCC,7)


---------------------------------------------------------------------------
NetworkXError                             Traceback (most recent call last)
<ipython-input-10-fb53b157c762> in <module>()
----> 1 core7 = nx.k_core(LargestCC,7)

/Users/km/anaconda/lib/python2.7/site-packages/networkx/algorithms/core.pyc in k_core(G, k, core_number)
    151     """
    152     if core_number is None:
--> 153         core_number=nx.core_number(G)
    154     if k is None:
    155         k=max(core_number.values()) # max core

/Users/km/anaconda/lib/python2.7/site-packages/networkx/algorithms/core.pyc in core_number(G)
     68         raise nx.NetworkXError(
     69                 'Input graph has self loops; the core number is not defined.',
---> 70                 'Consider using G.remove_edges_from(G.selfloop_edges()).')
     71 
     72     if G.is_directed():

NetworkXError: ('Input graph has self loops; the core number is not defined.', 'Consider using G.remove_edges_from(G.selfloop_edges()).')

In [11]:
Bcent = np.array(list(nx.betweenness_centrality(core7,normalized = True).values()))
scaledBC = scaler.fit_transform(Bcent)


/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)

In [12]:
Ecent = np.array(list(nx.eigenvector_centrality_numpy(core7).values()))
scaledEC = scaler.fit_transform(Ecent[:,np.newaxis])

In [ ]:
Ccent = np.array(list(nx.closeness_centrality(core7).values()))
scaledCC = scaler.fit_transform(Ccent[:,np.newaxis])

In [17]:
L1_dist = [L1(scaledCC,scaledBC),L1(scaledEC,scaledBC),L1(scaledEC,scaledCC)]
print("""From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures""")
D = L1_dist[0]
print("The L1 distance between Closeness centality and Betweenness Centrality is %d implying average distance of %f"%\
      (D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[1]
print("The L1 distance between Eigenvalue centality and Betweenness Centrality is %d implying average distance of %f"%\
      (D, D*1.0/nx.number_of_nodes(core7)))
D = L1_dist[2]
print("The L1 distance between Closeness centality and Eigenvalue Centrality is %d implying average distance of %f"%\
      (D, D*1.0/nx.number_of_nodes(core7)))


From the three measures we have explored, Eigenvalue centality
Betweenness centality and Closeness centality. We can now evaluate the
L1 distance between the measures
The L1 distance between Closeness centality and Betweenness Centrality is 386829 implying average distance of 391.527373
The L1 distance between Eigenvalue centality and Betweenness Centrality is 45735 implying average distance of 46.290495
The L1 distance between Closeness centality and Eigenvalue Centrality is 371185 implying average distance of 375.693507

In [18]:
plt.yscale('log')
plt.hist(scaledBC)


Out[18]:
(array([ 929.,   19.,   23.,    5.,    6.,    2.,    2.,    0.,    1.,    1.]),
 array([  50.,  125.,  200.,  275.,  350.,  425.,  500.,  575.,  650.,
         725.,  800.]),
 <a list of 10 Patch objects>)

In [19]:
plt.yscale('log')
plt.hist(scaledEC)


Out[19]:
(array([ 780.,  146.,   48.,    7.,    0.,    2.,    0.,    2.,    1.,    2.]),
 array([  50.,  125.,  200.,  275.,  350.,  425.,  500.,  575.,  650.,
         725.,  800.]),
 <a list of 10 Patch objects>)

In [21]:
plt.yscale('log')
plt.hist(scaledCC)


Out[21]:
(array([  11.,   29.,   31.,   57.,  175.,  252.,  332.,   71.,   21.,    9.]),
 array([  50.,  125.,  200.,  275.,  350.,  425.,  500.,  575.,  650.,
         725.,  800.]),
 <a list of 10 Patch objects>)

In [11]:
f = nx.fiedler_vector(core7)
s = np.zeros(len(f),dtype='int')
s[f>0]=1


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-a697dcd2cdf7> in <module>()
----> 1 f = nx.fiedler_vector(core7)
      2 s = np.zeros(len(f),dtype='int')
      3 s[f>0]=1

NameError: name 'core7' is not defined

In [12]:
colors = ['#d7191c', '#2b83ba']
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-495c7a8b012f> in <module>()
      1 colors = ['#d7191c', '#2b83ba']
----> 2 node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
      3 nx.draw(core7, node_color=node_colors,node_size=10)

NameError: name 'core7' is not defined

In [55]:
L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eig(L)
v = np.array(v)
worder = np.argsort(w)
#pos = {i: np.array([f[0], f[1]]) for i, f in enumerate(zip(v[:,worder[1]], v[:,worder[2]]))}

In [56]:
X = v @ np.diag(w)
X = X[:,worder]

In [57]:
error = np.zeros(9)
for k in range(2,11):
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit_predict(X[:,1:3])
    error[k-2] = kmeans.inertia_

In [58]:
plt.plot(range(2,11),error)


Out[58]:
[<matplotlib.lines.Line2D at 0x7f61c24a39b0>]

In [68]:
kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
kmeans.fit_predict(X[:,1:3])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

In [70]:
colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, node_color=node_colors,node_size=10)


/home/ubuntu/anaconda3/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:126: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()
/home/ubuntu/anaconda3/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:138: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  plt.hold(b)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py:917: UserWarning: axes.hold is deprecated. Please remove it from your matplotlibrc and/or style files.
  warnings.warn(self.msg_depr_set % key)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/rcsetup.py:152: UserWarning: axes.hold is deprecated, will be removed in 3.0
  warnings.warn("axes.hold is deprecated, will be removed in 3.0")

In [23]:
import sklearn
print sklearn.__version__


0.17.1

In [18]:
import 
from sklearn import mixture

mixture.GaussianMixture()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-a1a84fb1c0cf> in <module>()
      2 from sklearn import mixture
      3 
----> 4 mixture.GaussianMixture()

AttributeError: 'module' object has no attribute 'GaussianMixture'

In [67]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=8, max_df=0.8)
dtm = vectorizer.fit_transform(TextList)

del TextList
terms = vectorizer.get_feature_names()
print("Finished")


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-67-b0bc613432c5> in <module>()
----> 1 vectorizer = TfidfVectorizer(stop_words='english', min_df=8, max_df=0.8)
      2 dtm = vectorizer.fit_transform(TextList)
      3 
      4 del TextList
      5 terms = vectorizer.get_feature_names()

NameError: name 'TfidfVectorizer' is not defined

In [5]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-7153293e4f30> in <module>()
----> 1 df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

NameError: name 'cols' is not defined

In [25]:
prefix = './trainingandtestdata/'

testfile = prefix + 'testdata.manual.2009.06.14.csv'
trainfile = prefix + 'training.1600000.processed.noemoticon.csv'

In [27]:
df = pd.read_csv(trainfile,names=cols,encoding='latin-1')#names=m_cols ,

In [28]:
len(df)


Out[28]:
1600000

In [29]:
df.head()


Out[29]:
polarity tweetID date Query UserID text
0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t...
1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....

In [41]:
df_small = df.iloc[:600000]

In [31]:
len(df_small)


Out[31]:
1000

In [44]:
#long
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=8,max_df=0.8)
M = vectorizer.fit_transform(df.text)

In [48]:
print type(M)
M


<class 'scipy.sparse.csr.csr_matrix'>
Out[48]:
<1600000x52521 sparse matrix of type '<type 'numpy.float64'>'
	with 10140434 stored elements in Compressed Sparse Row format>

In [55]:
#doesnt work well

# from scipy import io

# with open('M.mtx','w') as fout:
#     io.mmwrite(fout, M)#, comment='', field=None, precision=None, symmetry=None)[source]

# io.mmwrite('M', M)

In [53]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=10, random_state=42)
X = svd.fit_transform(M)

In [139]:
svd.components_


Out[139]:
array([[  2.72001580e-03,   1.69149426e-03,   1.09716343e-04, ...,
          2.72950482e-05,   3.23040176e-06,   5.18766711e-06],
       [ -4.29279707e-04,   1.22860704e-03,   5.74032116e-05, ...,
          3.49286375e-05,   7.78247908e-06,   1.07673030e-05],
       [ -6.20510551e-04,  -2.94362336e-04,  -4.71793942e-05, ...,
          3.79566391e-06,  -8.31496087e-08,   3.20352253e-06],
       ..., 
       [ -1.17122879e-04,  -6.05301438e-05,   1.80451782e-05, ...,
         -1.61457500e-05,   8.85458117e-07,  -1.28697769e-06],
       [ -5.88878369e-04,  -3.37554068e-04,   7.58171236e-05, ...,
          1.73194680e-06,   4.35612975e-07,  -2.37956566e-06],
       [ -8.27196817e-04,  -2.17366492e-04,  -2.30040004e-05, ...,
         -3.65960547e-05,   1.45690214e-06,  -1.99006957e-06]])

In [56]:
X.shape


Out[56]:
(1600000, 50)

In [57]:
#Kmeans

# Clustering with some parameters.

from sklearn.cluster import KMeans
ncl = 8
k = 30
kmeans = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y = kmeans.fit_predict(X[:,:k])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_
y


Out[57]:
array([6, 7, 0, ..., 0, 7, 0], dtype=int32)

In [ ]:
for tweets in df[:1000]

In [59]:
df_ = df.copy()

In [131]:
df_['class'] = y_30

In [138]:
df_[df_['class']==4]


Out[138]:
polarity tweetID date Query UserID text class
103 0 1467837470 Mon Apr 06 22:26:43 PDT 2009 NO_QUERY annette414 watching &quot;House&quot; 4
119 0 1467839586 Mon Apr 06 22:27:18 PDT 2009 NO_QUERY sonyolmos @eRRe_sC aaw i miss ya all too.. im leaving to... 4
124 0 1467840552 Mon Apr 06 22:27:34 PDT 2009 NO_QUERY weefranniev Late night snack, glass of OJ b/c I'm &quot;do... 4
183 0 1467858363 Mon Apr 06 22:32:12 PDT 2009 NO_QUERY schammy Downloading NIN's new album &quot;the slip&quo... 4
206 0 1467862710 Mon Apr 06 22:33:20 PDT 2009 NO_QUERY Jemimus My mind and body are severely protesting this ... 4
213 0 1467863684 Mon Apr 06 22:33:35 PDT 2009 NO_QUERY DjGundam Awwh babs... you look so sad underneith that s... 4
286 0 1467881920 Mon Apr 06 22:38:28 PDT 2009 NO_QUERY mumu1210 FML: So much for seniority, bc of technologica... 4
319 0 1467894593 Mon Apr 06 22:41:52 PDT 2009 NO_QUERY Tanja71 @JonathanRKnight Oh! Did I mention it? &quot;G... 4
393 0 1467911624 Mon Apr 06 22:46:32 PDT 2009 NO_QUERY Mati_UOIT Sitting here wondering why &quot;ED&quot; stil... 4
400 0 1467913111 Mon Apr 06 22:46:57 PDT 2009 NO_QUERY nssmom #3 woke up and was having an accident - &quot;... 4
473 0 1467931070 Mon Apr 06 22:52:06 PDT 2009 NO_QUERY calee01 &quot;On popular music&quot; by T.W.Adorno is ... 4
492 0 1467934004 Mon Apr 06 22:52:56 PDT 2009 NO_QUERY malice_sin pears &amp; Brie, bottle of Cabernet, and &quo... 4
510 0 1467943007 Mon Apr 06 22:55:30 PDT 2009 NO_QUERY vibratoria @stuiy never again will I click on a link that... 4
569 0 1467952985 Mon Apr 06 22:58:24 PDT 2009 NO_QUERY mickeyness @daniela_95616 hahaa!! i just realized &quot;i... 4
592 0 1467962336 Mon Apr 06 23:00:55 PDT 2009 NO_QUERY umfoo my heart is broken every morning dropping Foo ... 4
725 0 1467992696 Mon Apr 06 23:09:35 PDT 2009 NO_QUERY pop_corn_ I feel like I am the only &quot;twitterer&quot... 4
742 0 1467996096 Mon Apr 06 23:10:33 PDT 2009 NO_QUERY Beatmixology @djsoulsister yeah, great vid. I had the 12&qu... 4
747 0 1467998037 Mon Apr 06 23:11:06 PDT 2009 NO_QUERY amitgupta Have an invite for &quot;Healthy Dining&quot; ... 4
817 0 1468013020 Mon Apr 06 23:15:30 PDT 2009 NO_QUERY melvinchia has a mild left inner ear infection.. and its ... 4
879 0 1468031882 Mon Apr 06 23:21:15 PDT 2009 NO_QUERY NesbyPhips So Im done editing &quot;The Phipstape&quot;. ... 4
905 0 1468035396 Mon Apr 06 23:22:24 PDT 2009 NO_QUERY noslennai Watching &quot;a league of their own&quot;...m... 4
979 0 1468052239 Mon Apr 06 23:27:43 PDT 2009 NO_QUERY ConstanceClark @MarcusMims wow i didn't get an &quot;hello&qu... 4
987 0 1468053198 Mon Apr 06 23:28:02 PDT 2009 NO_QUERY grinc Reading Buyology before bedtime... great premi... 4
990 0 1468053804 Mon Apr 06 23:28:13 PDT 2009 NO_QUERY princeralph omg... &quot;The Reader&quot; is making me 4
1044 0 1468068303 Mon Apr 06 23:32:33 PDT 2009 NO_QUERY rob_fitzpatrick @gracedent it's her &quot;hair&quot; I can't d... 4
1048 0 1468068726 Mon Apr 06 23:32:42 PDT 2009 NO_QUERY verruca Why isn't there a &quot;fake&quot; Verruca on ... 4
1071 0 1468074597 Mon Apr 06 23:34:32 PDT 2009 NO_QUERY PeterHC Used the term &quot;Fail Whale&quot; to a clie... 4
1086 0 1468076942 Mon Apr 06 23:35:15 PDT 2009 NO_QUERY geekpondering @transbay &quot;SFMTA Budget Proposal Hearing:... 4
1104 0 1468080524 Mon Apr 06 23:36:22 PDT 2009 NO_QUERY Msjackson88 still no &quot;followers&quot; please some1 ... 4
1204 0 1468107293 Mon Apr 06 23:45:12 PDT 2009 NO_QUERY norawatkins VIP guests today -________-&quot; blohheeee ... 4
... ... ... ... ... ... ... ...
1598559 4 2193190901 Tue Jun 16 08:07:11 PDT 2009 NO_QUERY MsSophieBelle @KempEquine nothing wrong Mom has one for me(... 4
1598567 4 2193220591 Tue Jun 16 08:09:39 PDT 2009 NO_QUERY katyhelena @o0omunkieo0o Thanks! I think so too. I call h... 4
1598655 4 2193224813 Tue Jun 16 08:10:00 PDT 2009 NO_QUERY JanayS &quot;i wake up it's a bad dream no one on my ... 4
1598668 4 2193251846 Tue Jun 16 08:12:11 PDT 2009 NO_QUERY RunWithForest @explosivityy I'm writing now. Di ko na sinas... 4
1598718 4 2193254120 Tue Jun 16 08:12:22 PDT 2009 NO_QUERY christicox I asked a 3yr old how old I was &amp; he said,... 4
1598749 4 2193255731 Tue Jun 16 08:12:31 PDT 2009 NO_QUERY jointuletz am plecat la &quot;la comedie&quot;...v-astept... 4
1598770 4 2193277168 Tue Jun 16 08:14:18 PDT 2009 NO_QUERY peachtweet @JKL_Katie omg, i know. it's annoying! they're... 4
1598938 4 2193306490 Tue Jun 16 08:16:41 PDT 2009 NO_QUERY Emm_Jay @Twyst That would be great... &quot;looking f... 4
1598969 4 2193319050 Tue Jun 16 08:17:43 PDT 2009 NO_QUERY howardsublett @dougalcorn: It is actually just @daveminor wa... 4
1599053 4 2193343042 Tue Jun 16 08:19:44 PDT 2009 NO_QUERY CardboxDiva @thejoshlynn You are! BTW send me an email. Wa... 4
1599188 4 2193371978 Tue Jun 16 08:22:04 PDT 2009 NO_QUERY cunderwo @meganlm i rented &quot;dead silence&quot; at ... 4
1599264 4 2193402232 Tue Jun 16 08:24:36 PDT 2009 NO_QUERY krystlerb @TANGG GT was a good movie...although I spent ... 4
1599337 4 2193426650 Tue Jun 16 08:26:37 PDT 2009 NO_QUERY LucasSchmitt @maryzlane that sucks. :/ i guess you have to ... 4
1599410 4 2193428898 Tue Jun 16 08:26:48 PDT 2009 NO_QUERY caileighamazing &quot;Wow, What A Tight Fit&quot; Lmao, Shutup. 4
1599418 4 2193429153 Tue Jun 16 08:26:49 PDT 2009 NO_QUERY erikalanzer19 @brendanlover11 I really love &quot;Starlight&... 4
1599424 4 2193450427 Tue Jun 16 08:28:32 PDT 2009 NO_QUERY HelloGracey Exploring the world of Twitter Listening to ... 4
1599460 4 2193452390 Tue Jun 16 08:28:42 PDT 2009 NO_QUERY comradephil @aw16 I must have skipped the &quot;pun&quot; ... 4
1599478 4 2193453100 Tue Jun 16 08:28:45 PDT 2009 NO_QUERY jgunzz Why don't we name tomorrow &quot;the Official ... 4
1599557 4 2193475600 Tue Jun 16 08:30:33 PDT 2009 NO_QUERY johnbertr @chinkchilla there's this brilliant add-on for... 4
1599612 4 2193478589 Tue Jun 16 08:30:47 PDT 2009 NO_QUERY teotarafas @InesVargas 17&quot;?! Hope you dont plan on t... 4
1599615 4 2193478782 Tue Jun 16 08:30:48 PDT 2009 NO_QUERY MilanTeh @brothrsaw The count would lead to shutter dea... 4
1599669 4 2193503005 Tue Jun 16 08:32:45 PDT 2009 NO_QUERY jtsosnowski You know there's way too much going on when yo... 4
1599682 4 2193503503 Tue Jun 16 08:32:48 PDT 2009 NO_QUERY oscarbarber @perequintana ara sí que ets tot un &quot;pir... 4
1599702 4 2193504328 Tue Jun 16 08:32:52 PDT 2009 NO_QUERY SweetTartelette @ang_w It's been forever since I have had &quo... 4
1599802 4 2193529779 Tue Jun 16 08:34:56 PDT 2009 NO_QUERY maeubhminor woooo elliot minor in 8 days!!! and i got my E... 4
1599835 4 2193551788 Tue Jun 16 08:36:44 PDT 2009 NO_QUERY annamadeleine @alexandervelky that's polite version - i only... 4
1599840 4 2193552024 Tue Jun 16 08:36:44 PDT 2009 NO_QUERY bdottie What a pretty day &quot;Just smile&quot; 4
1599946 4 2193577228 Tue Jun 16 08:38:49 PDT 2009 NO_QUERY Sirley @chriscuzzy someone wanted a &quot;Cuzzy&quot;... 4
1599977 4 2193578386 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY TeamUKskyvixen @MayorDorisWolfe Thats my girl - dishing out t... 4
1599985 4 2193578982 Tue Jun 16 08:38:58 PDT 2009 NO_QUERY LISKFEST if ur the lead singer in a band, beware fallin... 4

33136 rows × 7 columns


In [133]:
df_[df_['class']==5]


Out[133]:
polarity tweetID date Query UserID text class
99 0 1467836859 Mon Apr 06 22:26:33 PDT 2009 NO_QUERY willy_chaz A bad nite for the favorite teams: Astros and ... 5
122 0 1467840016 Mon Apr 06 22:27:25 PDT 2009 NO_QUERY BustaBusta I know my life has been flipped upside down wh... 5
160 0 1467853356 Mon Apr 06 22:30:54 PDT 2009 NO_QUERY dbmendel Picked Mich St to win it all from the get go. ... 5
178 0 1467857297 Mon Apr 06 22:31:56 PDT 2009 NO_QUERY amanda5280 Today I realized I am too good at hiding thing... 5
192 0 1467859820 Mon Apr 06 22:32:36 PDT 2009 NO_QUERY msbutt3rfly14 spencer is not a good guy. 5
219 0 1467871040 Mon Apr 06 22:35:31 PDT 2009 NO_QUERY MTLarson1224 @DonnieWahlberg I hope i can make it to the au... 5
222 0 1467871545 Mon Apr 06 22:35:40 PDT 2009 NO_QUERY Cherye101 @PaulaAbdul awww, Good luck Paula!! Please don... 5
290 0 1467882902 Mon Apr 06 22:38:44 PDT 2009 NO_QUERY usagiko @LevenRambin: Take it easy, and be good to you. 5
298 0 1467889791 Mon Apr 06 22:40:33 PDT 2009 NO_QUERY jennhelvering Just called Hillsong again - they said they co... 5
320 0 1467894600 Mon Apr 06 22:41:51 PDT 2009 NO_QUERY dreaaa throat is closing up and i had some string che... 5
348 0 1467899025 Mon Apr 06 22:43:06 PDT 2009 NO_QUERY oup still sick. feeling a bit better, got some new... 5
441 0 1467924690 Mon Apr 06 22:50:17 PDT 2009 NO_QUERY FlyRice Good GOD they ruined my belly button!!! 5
444 0 1467925657 Mon Apr 06 22:50:34 PDT 2009 NO_QUERY aisyahsamsudin running nose + spinning head = not a good comb... 5
470 0 1467930341 Mon Apr 06 22:51:53 PDT 2009 NO_QUERY GemDoughnut MORNING!!! Good im bloody knackered!!! Work is... 5
475 0 1467931501 Mon Apr 06 22:52:13 PDT 2009 NO_QUERY soulonfire68 We've been good. I'm not liking the snow right... 5
508 0 1467937402 Mon Apr 06 22:53:55 PDT 2009 NO_QUERY haunter_ @Houndour ...i wish i was there...i'm pretty g... 5
532 0 1467947005 Mon Apr 06 22:56:40 PDT 2009 NO_QUERY tamisara Good morning! Ready 2 go, but I want 2 go back... 5
567 0 1467952123 Mon Apr 06 22:58:08 PDT 2009 NO_QUERY TurkishDelite @JonathanRKnight Good Knight hun! Looking forw... 5
603 0 1467964229 Mon Apr 06 23:01:26 PDT 2009 NO_QUERY farty_brando @greggrunberg hey you said matt was gonna go a... 5
653 0 1467979491 Mon Apr 06 23:05:45 PDT 2009 NO_QUERY TeresaUlring @PaulColes hmmm...greed is good when it motiv... 5
689 0 1467985114 Mon Apr 06 23:07:22 PDT 2009 NO_QUERY gladyschock not feeling v good abt myself 5
745 0 1467997236 Mon Apr 06 23:10:55 PDT 2009 NO_QUERY alyssaisrad916 UpdatingFFE. That gives me nothing to do for ... 5
746 0 1467997817 Mon Apr 06 23:11:03 PDT 2009 NO_QUERY bellalucia Hot compress not rily workng for pulled muscle... 5
782 0 1468004641 Mon Apr 06 23:12:59 PDT 2009 NO_QUERY kow_shik @vivekg86 Good to hear that we have support in... 5
791 0 1468006362 Mon Apr 06 23:13:30 PDT 2009 NO_QUERY woflln Ugh can't sleep. Wish i had a good cuddle to ... 5
869 0 1468030831 Mon Apr 06 23:20:55 PDT 2009 NO_QUERY StrAbZ good morning everybody! pkoi y fais pas beau ... 5
910 0 1468035840 Mon Apr 06 23:22:31 PDT 2009 NO_QUERY Catmoo @Sephystryx I've been looking about for good s... 5
1003 0 1468060375 Mon Apr 06 23:30:10 PDT 2009 NO_QUERY BrandesAsh 'study group extraordinare' about to leave cam... 5
1026 0 1468063973 Mon Apr 06 23:31:15 PDT 2009 NO_QUERY HilaryBays @BenPritchett goodness me, how did you find me... 5
1028 0 1468064339 Mon Apr 06 23:31:18 PDT 2009 NO_QUERY Jenoah1908 my poor little girl has a baaaad rash on her b... 5
... ... ... ... ... ... ... ...
1599486 4 2193453323 Tue Jun 16 08:28:46 PDT 2009 NO_QUERY janiecwales @fudgecrumpet Oh good, it will be good to get ... 5
1599511 4 2193454228 Tue Jun 16 08:28:51 PDT 2009 NO_QUERY panicxx Good cinema exam!! 5
1599527 4 2193474286 Tue Jun 16 08:30:27 PDT 2009 NO_QUERY siobhandemers Good Morning!! I'm making cookies today 5
1599546 4 2193475013 Tue Jun 16 08:30:30 PDT 2009 NO_QUERY nataliefyffe I totally rocked drunken Rockband! Nothin' bet... 5
1599589 4 2193477152 Tue Jun 16 08:30:40 PDT 2009 NO_QUERY jhollenbach @MikeRuocco good luck 5
1599592 4 2193477292 Tue Jun 16 08:30:41 PDT 2009 NO_QUERY kenyonknight Hmm I dreamed that I woke up in good ol'New Ha... 5
1599609 4 2193478306 Tue Jun 16 08:30:46 PDT 2009 NO_QUERY loyalwolf06 is SO excited for The Fray concert today!!!!!!... 5
1599614 4 2193478763 Tue Jun 16 08:30:48 PDT 2009 NO_QUERY Nelinski_254 #mw2 Click on more from this user.. and rate m... 5
1599629 4 2193501365 Tue Jun 16 08:32:37 PDT 2009 NO_QUERY pianogeek Good morning everyone! And to my fabulous UK ... 5
1599642 4 2193501998 Tue Jun 16 08:32:40 PDT 2009 NO_QUERY PaulHarriott @hype6477 Good to hear all's cool. I've just s... 5
1599656 4 2193502444 Tue Jun 16 08:32:42 PDT 2009 NO_QUERY ireneagh @pinkpeony_etsy you have a lovely shop on etsy... 5
1599660 4 2193502743 Tue Jun 16 08:32:44 PDT 2009 NO_QUERY Ritter33 rain is good 5
1599690 4 2193503753 Tue Jun 16 08:32:49 PDT 2009 NO_QUERY mommyperks @Travelwthemagic Good morning, here. Still jus... 5
1599762 4 2193527876 Tue Jun 16 08:34:47 PDT 2009 NO_QUERY MKeithHarris @denniswords good times indeed 5
1599768 4 2193528075 Tue Jun 16 08:34:48 PDT 2009 NO_QUERY rach3lizabeth Good morning everyone. 5
1599782 4 2193528659 Tue Jun 16 08:34:51 PDT 2009 NO_QUERY annaasaywhat http://bln.kr/T1/ good song (Y) 5
1599821 4 2193551359 Tue Jun 16 08:36:41 PDT 2009 NO_QUERY HildeM_EN @Piewacket1 good sometimes a shot attention ... 5
1599825 4 2193551473 Tue Jun 16 08:36:42 PDT 2009 NO_QUERY james_bertram @tonyhawk Yess! Good choice of laptop there, t... 5
1599827 4 2193551560 Tue Jun 16 08:36:42 PDT 2009 NO_QUERY JadoreConcierge Good morning Chicago, I love this city! 5
1599828 4 2193551571 Tue Jun 16 08:36:42 PDT 2009 NO_QUERY crside @dfinchalicious It ends up like that when goo... 5
1599841 4 2193552033 Tue Jun 16 08:36:44 PDT 2009 NO_QUERY josephranseth Good morning everyone! I hope you take some ti... 5
1599850 4 2193552448 Tue Jun 16 08:36:46 PDT 2009 NO_QUERY jenniehager sick, sick, sick today, but still fingers cros... 5
1599856 4 2193552668 Tue Jun 16 08:36:48 PDT 2009 NO_QUERY CintiaXimena taking a twit break...a lil blown about my bos... 5
1599869 4 2193553340 Tue Jun 16 08:36:51 PDT 2009 NO_QUERY nirvannah @carahsollins Good Morning Lady 5
1599875 4 2193553559 Tue Jun 16 08:36:52 PDT 2009 NO_QUERY MichiTheReal @ashleytisdale hih good morning in austria it... 5
1599907 4 2193575210 Tue Jun 16 08:38:39 PDT 2009 NO_QUERY themunny @gabespears morning 5
1599916 4 2193575737 Tue Jun 16 08:38:41 PDT 2009 NO_QUERY MzGr33n4ppL3 Hello good morning! i wanna do something today. 5
1599937 4 2193576655 Tue Jun 16 08:38:46 PDT 2009 NO_QUERY eratyptin @siahoney I am good thanks! How is #Eric, I... 5
1599988 4 2193579191 Tue Jun 16 08:38:59 PDT 2009 NO_QUERY tellman @Roy_Everitt ha- good job. that's right - we g... 5
1599994 4 2193579489 Tue Jun 16 08:39:00 PDT 2009 NO_QUERY EvolveTom @Cliff_Forster Yeah, that does work better tha... 5

67484 rows × 7 columns


In [ ]:


In [103]:
df_[df_['UserID']=='usagiko']


Out[103]:
polarity tweetID date Query UserID text class
290 0 1467882902 Mon Apr 06 22:38:44 PDT 2009 NO_QUERY usagiko @LevenRambin: Take it easy, and be good to you. 5
121160 0 1833443461 Sun May 17 23:18:10 PDT 2009 NO_QUERY usagiko @danielthomsen: I hope you don't get the hamth... 7
122488 0 1833720820 Mon May 18 00:13:28 PDT 2009 NO_QUERY usagiko @rocketgirl13: I'd hug you but I'd be hugging ... 0
132187 0 1835698381 Mon May 18 06:33:52 PDT 2009 NO_QUERY usagiko Why am I awake? 0
188985 0 1969000232 Fri May 29 22:45:38 PDT 2009 NO_QUERY usagiko @aelysian: we're too popular for our own good,... 5
231465 0 1978970242 Sun May 31 01:07:24 PDT 2009 NO_QUERY usagiko @churunga: I still have your gifts to send. I ... 7
258601 0 1985229042 Sun May 31 16:41:30 PDT 2009 NO_QUERY usagiko @roanapur: oh honey. I'm so sorry. 7
379545 0 2052364361 Sat Jun 06 00:29:06 PDT 2009 NO_QUERY usagiko @trifluorides: I wish I could use those. But b... 7
380562 0 2052591055 Sat Jun 06 01:16:00 PDT 2009 NO_QUERY usagiko @DarthRyu666: I got the raw, since I'm hardcor... 7
411700 0 2060181181 Sat Jun 06 18:25:00 PDT 2009 NO_QUERY usagiko @kiptripsyc: I want the new forme of pokemans. 4
415956 0 2061190664 Sat Jun 06 20:21:43 PDT 2009 NO_QUERY usagiko @imbrifer: YAYYYY! Also, it'll probably take s... 7
445539 0 2068152200 Sun Jun 07 13:35:19 PDT 2009 NO_QUERY usagiko So, depending on what the diagnosis is, we may... 0
446967 0 2068569121 Sun Jun 07 14:18:30 PDT 2009 NO_QUERY usagiko @kiddetective: oh honey. 0
529910 0 2195692030 Tue Jun 16 11:28:29 PDT 2009 NO_QUERY usagiko @hadleyk: In a &quot;Doctor Who&quot; kinda wa... 4
675244 0 2248034283 Fri Jun 19 20:00:09 PDT 2009 NO_QUERY usagiko @LevenRambin: They're making her do what? That... 7
676006 0 2248270018 Fri Jun 19 20:21:41 PDT 2009 NO_QUERY usagiko @sampo_ilmari http://twitpic.com/7vaq8 - You l... 1
688126 0 2251447717 Sat Jun 20 03:32:10 PDT 2009 NO_QUERY usagiko @finding_jay: I know. You had a very early sta... 7
688951 0 2251644443 Sat Jun 20 04:09:05 PDT 2009 NO_QUERY usagiko @finding_jay: My bb. [gives you massage] 0
719533 0 2260622542 Sat Jun 20 19:30:53 PDT 2009 NO_QUERY usagiko Found aloe, so itchy. 0
799748 0 2329114201 Thu Jun 25 10:21:59 PDT 2009 NO_QUERY usagiko @finding_jay: I'm sorry I wasn't there, bb. F... 0
938726 4 1793524597 Thu May 14 03:01:32 PDT 2009 NO_QUERY usagiko @LevenRambin: pics or it didn't happen, dude. ... 0
1155596 4 1979023887 Sun May 31 01:19:57 PDT 2009 NO_QUERY usagiko @churunga: Also, do you liek Spock? 0
1265419 4 1999438146 Mon Jun 01 20:13:04 PDT 2009 NO_QUERY usagiko @socalanon: are we getting an award? 0

In [130]:
classes = {}
for user in df_['UserID'].unique():
    vals = {}
    for i in df_[df_['UserID']==user]['class']:
        if i!=1:
            if i in vals:
                vals[i]+=1
            else:
                vals[i]=1

    try:
        classes[user] = max(vals, key=stats.get)
    except ValueError:
        classes[user] = 1


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-130-d8b44f498784> in <module>()
      2 for user in df_['UserID'].unique():
      3     vals = {}
----> 4     for i in df_[df_['UserID']==user]['class']:
      5         if i!=1:
      6             if i in vals:

/Users/km/anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in wrapper(self, other, axis)
    853 
    854             with np.errstate(all='ignore'):
--> 855                 res = na_op(values, other)
    856             if isscalar(res):
    857                 raise TypeError('Could not compare %s type with Series' %

/Users/km/anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in na_op(x, y)
    757 
    758         if is_object_dtype(x.dtype):
--> 759             result = _comp_method_OBJECT_ARRAY(op, x, y)
    760         else:
    761 

/Users/km/anaconda/lib/python2.7/site-packages/pandas/core/ops.pyc in _comp_method_OBJECT_ARRAY(op, x, y)
    737         result = lib.vec_compare(x, y, op)
    738     else:
--> 739         result = lib.scalar_compare(x, y, op)
    740     return result
    741 

KeyboardInterrupt: 

In [ ]:
import statistics
from statistics import StatisticsError
import random
import math


# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon','black']
user_class = []
for g in core7:
    try:
        try:
            Id_Pred = df_[df_['UserID']==user]['class']
            X = statistics.mode(ID_Pred[g])
            node_colors.append(colors[X])
        except StatisticsError:
            node_colors.append(colors[ID_Pred[g][random.randint(0,len(ID_Pred[g])-1)]])
    except KeyError:
        node_colors.append(colors[6])

In [112]:
# df_[df_['UserID']=='ForzaRagazza']
vals = {}
for i in df_[df_['UserID']=='usagiko']['class']:
    if i!=1:
        if i in vals:
            vals[i]+=1
        else:
            vals[i]=1

max(vals, key=stats.get)


Out[112]:
0

In [100]:
stats = {'a':2,'b':4,'c':1}
max(stats, key=stats.get)


Out[100]:
'b'

In [97]:
df_


Out[97]:
polarity tweetID date Query UserID text class
0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... 6
1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ... 7
2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man... 0
3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire 7
4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all.... 0
5 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf @Kwesidei not the whole crew 0
6 0 1467811592 Mon Apr 06 22:20:03 PDT 2009 NO_QUERY mybirch Need a hug 0
7 0 1467811594 Mon Apr 06 22:20:03 PDT 2009 NO_QUERY coZZ @LOLTrish hey long time no see! Yes.. Rains a... 4
8 0 1467811795 Mon Apr 06 22:20:05 PDT 2009 NO_QUERY 2Hood4Hollywood @Tatiana_K nope they didn't have it 0
9 0 1467812025 Mon Apr 06 22:20:09 PDT 2009 NO_QUERY mimismo @twittera que me muera ? 0
10 0 1467812416 Mon Apr 06 22:20:16 PDT 2009 NO_QUERY erinx3leannexo spring break in plain city... it's snowing 0
11 0 1467812579 Mon Apr 06 22:20:17 PDT 2009 NO_QUERY pardonlauren I just re-pierced my ears 7
12 0 1467812723 Mon Apr 06 22:20:19 PDT 2009 NO_QUERY TLeC @caregiving I couldn't bear to watch it. And ... 0
13 0 1467812771 Mon Apr 06 22:20:19 PDT 2009 NO_QUERY robrobbierobert @octolinz16 It it counts, idk why I did either... 0
14 0 1467812784 Mon Apr 06 22:20:20 PDT 2009 NO_QUERY bayofwolves @smarrison i would've been the first, but i di... 7
15 0 1467812799 Mon Apr 06 22:20:20 PDT 2009 NO_QUERY HairByJess @iamjazzyfizzle I wish I got to watch it with ... 4
16 0 1467812964 Mon Apr 06 22:20:22 PDT 2009 NO_QUERY lovesongwriter Hollis' death scene will hurt me severely to w... 0
17 0 1467813137 Mon Apr 06 22:20:25 PDT 2009 NO_QUERY armotley about to file taxes 0
18 0 1467813579 Mon Apr 06 22:20:31 PDT 2009 NO_QUERY starkissed @LettyA ahh ive always wanted to see rent lov... 0
19 0 1467813782 Mon Apr 06 22:20:34 PDT 2009 NO_QUERY gi_gi_bee @FakerPattyPattz Oh dear. Were you drinking ou... 0
20 0 1467813985 Mon Apr 06 22:20:37 PDT 2009 NO_QUERY quanvu @alydesigns i was out most of the day so didn'... 4
21 0 1467813992 Mon Apr 06 22:20:38 PDT 2009 NO_QUERY swinspeedx one of my friend called me, and asked to meet ... 7
22 0 1467814119 Mon Apr 06 22:20:40 PDT 2009 NO_QUERY cooliodoc @angry_barista I baked you a cake but I ated it 0
23 0 1467814180 Mon Apr 06 22:20:40 PDT 2009 NO_QUERY viJILLante this week is not going as i had hoped 7
24 0 1467814192 Mon Apr 06 22:20:41 PDT 2009 NO_QUERY Ljelli3166 blagh class at 8 tomorrow 0
25 0 1467814438 Mon Apr 06 22:20:44 PDT 2009 NO_QUERY ChicagoCubbie I hate when I have to call and wake people up 7
26 0 1467814783 Mon Apr 06 22:20:50 PDT 2009 NO_QUERY KatieAngell Just going to cry myself to sleep after watchi... 4
27 0 1467814883 Mon Apr 06 22:20:52 PDT 2009 NO_QUERY gagoo im sad now Miss.Lilly 7
28 0 1467815199 Mon Apr 06 22:20:56 PDT 2009 NO_QUERY abel209 ooooh.... LOL that leslie.... and ok I won't ... 0
29 0 1467815753 Mon Apr 06 22:21:04 PDT 2009 NO_QUERY BaptisteTheFool Meh... Almost Lover is the exception... this t... 0
... ... ... ... ... ... ... ...
1599970 4 2193578196 Tue Jun 16 08:38:54 PDT 2009 NO_QUERY adbillingsley Thanks @eastwestchic &amp; @wangyip Thanks! Th... 4
1599971 4 2193578237 Tue Jun 16 08:38:54 PDT 2009 NO_QUERY gekkko @marttn thanks Martin. not the most imaginativ... 0
1599972 4 2193578269 Tue Jun 16 08:38:54 PDT 2009 NO_QUERY millerslab @MikeJonesPhoto Congrats Mike Way to go! 0
1599973 4 2193578319 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY luckygeorgeblog http://twitpic.com/7jp4n - OMG! Office Space..... 1
1599974 4 2193578345 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY Kristah_Diggs @yrclndstnlvr ahaha nooo you were just away fr... 7
1599975 4 2193578347 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY CoachChic @BizCoachDeb Hey, I'm baack! And, thanks so m... 0
1599976 4 2193578348 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY serianna @mattycus Yeah, my conscience would be clear i... 0
1599977 4 2193578386 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY TeamUKskyvixen @MayorDorisWolfe Thats my girl - dishing out t... 7
1599978 4 2193578395 Tue Jun 16 08:38:55 PDT 2009 NO_QUERY LaurenMoo10 @shebbs123 i second that 0
1599979 4 2193578576 Tue Jun 16 08:38:57 PDT 2009 NO_QUERY angel_sammy04 In the garden 0
1599980 4 2193578679 Tue Jun 16 08:38:56 PDT 2009 NO_QUERY puchal_ek @myheartandmind jo jen by nemuselo zrovna té ... 0
1599981 4 2193578716 Tue Jun 16 08:38:57 PDT 2009 NO_QUERY youtubelatest Another Commenting Contest! [;: Yay!!! http:/... 1
1599982 4 2193578739 Tue Jun 16 08:38:57 PDT 2009 NO_QUERY Mandi_Davenport @thrillmesoon i figured out how to see my twee... 0
1599983 4 2193578758 Tue Jun 16 08:38:57 PDT 2009 NO_QUERY xoAurixo @oxhot theri tomorrow, drinking coffee, talkin... 7
1599984 4 2193578847 Tue Jun 16 08:38:57 PDT 2009 NO_QUERY RobFoxKerr You heard it here first -- We're having a girl... 0
1599985 4 2193578982 Tue Jun 16 08:38:58 PDT 2009 NO_QUERY LISKFEST if ur the lead singer in a band, beware fallin... 6
1599986 4 2193579087 Tue Jun 16 08:38:58 PDT 2009 NO_QUERY marhgil @tarayqueen too much ads on my blog. 0
1599987 4 2193579092 Tue Jun 16 08:38:58 PDT 2009 NO_QUERY cathriiin @La_r_a NEVEER I think that you both will get... 7
1599988 4 2193579191 Tue Jun 16 08:38:59 PDT 2009 NO_QUERY tellman @Roy_Everitt ha- good job. that's right - we g... 5
1599989 4 2193579211 Tue Jun 16 08:38:59 PDT 2009 NO_QUERY jazzstixx @Ms_Hip_Hop im glad ur doing well 7
1599990 4 2193579249 Tue Jun 16 08:38:59 PDT 2009 NO_QUERY razzberry5594 WOOOOO! Xbox is back 0
1599991 4 2193579284 Tue Jun 16 08:38:59 PDT 2009 NO_QUERY AgustinaP @rmedina @LaTati Mmmm That sounds absolutely ... 7
1599992 4 2193579434 Tue Jun 16 08:39:00 PDT 2009 NO_QUERY sdancingsteph ReCoVeRiNg FrOm ThE lOnG wEeKeNd 0
1599993 4 2193579477 Tue Jun 16 08:39:00 PDT 2009 NO_QUERY ChloeAmisha @SCOOBY_GRITBOYS 0
1599994 4 2193579489 Tue Jun 16 08:39:00 PDT 2009 NO_QUERY EvolveTom @Cliff_Forster Yeah, that does work better tha... 4
1599995 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY AmandaMarie1028 Just woke up. Having no school is the best fee... 4
1599996 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY TheWDBoards TheWDB.com - Very cool to hear old Walt interv... 6
1599997 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY bpbabe Are you ready for your MoJo Makeover? Ask me f... 0
1599998 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY tinydiamondz Happy 38th Birthday to my boo of alll time!!! ... 7
1599999 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY RyanTrevMorris happy #charitytuesday @theNSPCC @SparksCharity... 0

1600000 rows × 7 columns


In [95]:
dfff = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3],'B': [5, 6, 5, 5, 5, 7, 6]})
# df.mode()

In [73]:
# df.columns#
df[u'UserID'].unique()


Out[73]:
array([u'_TheSpecialOne_', u'scotthamilton', u'mattycus', ...,
       u'EvolveTom', u'AmandaMarie1028', u'bpbabe'], dtype=object)

In [66]:
#Kmeans

# Clustering with some parameters.

from sklearn.cluster import KMeans
ncl = 8
k = 30
kmeans_30 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
y_30 = kmeans_30.fit_predict(X[:,:30])
# centroids = kmeans.cluster_centers_
# labels = kmeans.labels_
# error = kmeans.inertia_
y_30


Out[66]:
array([0, 1, 1, ..., 1, 1, 1], dtype=int32)

In [ ]:
# # GG=nx.Graph() # so that we dont destroy G if we start running this cell

# m=0
# n=0

# for index, row in df_.iterrows():
#     G.add_node(row[4])
#     if '@' in row[5]:
#         m+=1
#         for t in re.split('[^a-zA-Z\_\@]', row[5]):
#             if t!='' and t[0]=='@':
#                 G.add_edge(row[4],t[1:])
#                 n+=1

In [127]:
colors = ['b','w','r','g','c','m','y','k']
# g : green.
# r : red.
# c : cyan.
# m : magenta.
# y : yellow.
# k : black.
# w : white.

In [125]:
i=0
labeled_nodes = []
for x in core7.nodes():
    labeled_nodes.append( (x, 1) )
    try:
        labeled_nodes[i]=( (x,classes[x]) )
    except KeyError:
        pass
    i+=1

In [ ]:
labeled

In [129]:
fig = plt.figure(figsize=(12,6))
ax = plt.subplot(111)

# labeled_nodes = [(x,classes[x]) for x in core7.nodes()]
    
# which = np.random.choice(range(len(labeled_nodes)),500)
which = range(len(core7))
# mini_g = core7.subgraph([labeled_nodes[i][0] for i in which])
mini_g = core7
node_colors = [colors[labeled_nodes[i][1]] for i in which]

nx.draw(mini_g, node_color=node_colors,node_size=100, ax=ax, with_labels='False',
       alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core,nodelist=Gc_core.nodes()[:100], node_color=node_colors,node_size=100, ax=ax, with_labels='False',
#        alpha =0.2, font_size=0,width=0.1)
# nx.draw(Gc_core, node_color=node_colors,node_size=10, ax=ax, with_labels='True', font_size=16)



In [74]:
type(G.nodes()[0])


Out[74]:
str

In [79]:
G.nodes()[1]


Out[79]:
'elmoberry'

In [107]:
list(y_30).count(0)


Out[107]:
52429

In [108]:
list(y_30).count(1)


Out[108]:
1247302

In [110]:
for i in xrange(ncl):
    print list(y_30).count(i)


52429
1247302
43656
32481
33136
67484
52836
70676

In [111]:
# #Kmeans

# # Clustering with some parameters.

# from sklearn.cluster import KMeans
# ncl = 20
# k = 10
# kmeans_10_100 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
# y_10_100 = kmeans_10_100.fit_predict(X[:,:10])
# # centroids = kmeans.cluster_centers_
# # labels = kmeans.labels_
# # error = kmeans.inertia_
# y_10_100


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-111-dc0405c5162d> in <module>()
      7 k = 10
      8 kmeans_10_100 = KMeans(n_clusters=ncl, init='k-means++', max_iter=100, n_init=10,random_state=0)
----> 9 y_10_100 = kmeans_10_100.fit_predict(X[:,:10])
     10 # centroids = kmeans.cluster_centers_
     11 # labels = kmeans.labels_

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in fit_predict(self, X, y)
    828         predict(X).
    829         """
--> 830         return self.fit(X).labels_
    831 
    832     def fit_transform(self, X, y=None):

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in fit(self, X, y)
    819                 precompute_distances=self.precompute_distances,
    820                 tol=self.tol, random_state=random_state, copy_x=self.copy_x,
--> 821                 n_jobs=self.n_jobs)
    822         return self
    823 

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in k_means(X, n_clusters, init, precompute_distances, n_init, max_iter, verbose, tol, random_state, copy_x, n_jobs, return_n_iter)
    322                 X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
    323                 precompute_distances=precompute_distances, tol=tol,
--> 324                 x_squared_norms=x_squared_norms, random_state=random_state)
    325             # determine if these results are the best so far
    326             if best_inertia is None or inertia < best_inertia:

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _kmeans_single(X, n_clusters, x_squared_norms, max_iter, init, verbose, random_state, tol, precompute_distances)
    445             _labels_inertia(X, x_squared_norms, centers,
    446                             precompute_distances=precompute_distances,
--> 447                             distances=distances)
    448 
    449         # computation of the means is also called the M-step of EM

/Users/km/anaconda/lib/python2.7/site-packages/sklearn/cluster/k_means_.pyc in _labels_inertia(X, x_squared_norms, centers, precompute_distances, distances)
    576                                                     centers, distances)
    577         inertia = _k_means._assign_labels_array(
--> 578             X, x_squared_norms, centers, labels, distances=distances)
    579     return labels, inertia
    580 

KeyboardInterrupt: 

In [ ]:
for i in xrange(100):
    if pred[i]==1:
        print TextList[i]