In [6]:
from preprocess import *
from feature_extraction import *
from lsi import *
from kmeans import *
import CMUTweetTagger


def get_feature_sets(filename):

    df = preprocess(filename)
    print "length of data", len(df)

    data_sample = df['text'].str.lower()

    #TFIDF features without text processing
    [data_fs2, vectorizer, no_features] = vectorize(data_sample, TFIDF) #feature set 2
    #Unigram features without text processing
    [data_fs1, vectorizer, no_features] = vectorize(data_sample, UNI) #Feature set 1

    #Text preprocessing - stopwords, stemming, lowercase
    data_fs3 = tokenize_and_stopwords(data_sample)
    data_fs3 = stemmer(data_fs3)
    #use CMU tagger and remove NNP and NNPS
    
    print "CMU tagger"
    all_tags = CMUTweetTagger.runtagger_parse(data_fs3)
    for i in range(len(all_tags)):
        for tag in all_tags[i]:
#            print tag[1]
            if tag[1] == 'NNP' or tag[1] == 'NNPS':
                data_fs3[i] = data_fs3[i].replace(tag[0], '')  

    [data_fs3, vectorizer, no_features] = vectorize(data_fs3, TFIDF) #Feature set 3
    data_fs4 = lsa(data_fs3) #feature set 4 
    print data_fs1.shape
    print data_fs2.shape
    print data_fs3.shape
    print data_fs4.shape
    
    return [data_fs1, data_fs2, data_fs3, data_fs4, df]

In [7]:
def kmeans_analysis(filename):    
    no_clusters = 5
    filename = "clinton-50k.csv"
    [data_fs1, data_fs2, data_fs3, data_fs4, df1] = get_feature_sets(filename)
    df  = df1[['tweet_id','text', 'retweets']].copy()
    df['cluster_fs1'] = run_kmeans(data_fs1, 7)
    df['cluster_fs2'] = run_kmeans(data_fs2, 7)
    df['cluster_fs3'] = run_kmeans(data_fs3, 7)
    df['cluster_fs4'] = run_kmeans(data_fs4, 7)
    result_filename = filename.replace(".csv", "") +"-test.csv"
    df.to_csv(result_filename)
    return df

In [8]:
def perform_analysis(df):
    print df.groupby(['cluster_fs1']).describe()
    print df.groupby(['cluster_fs2']).describe()
    print df.groupby(['cluster_fs3']).describe()
    print df.groupby(['cluster_fs4']).describe()
    df.corr()
    result1 = df.sort(['cluster_fs1'])
    result2 = df.sort(['cluster_fs2'])
    result3 = df.sort(['cluster_fs3'])
    result4 = df.sort(['cluster_fs4'])
    result4

In [9]:
def main():
    f1 = "trump-50k.csv"
    f2 = "clinton-50k.csv"
    df1 = kmeans_analysis(f1)
    perform_analysis(df1)
    df2 = kmeans_analysis(f2)
    perform_analysis(df2)

In [10]:
main()


length of data 50000
stemming 
CMU tagger
Running LSA
(50000, 100)
(50000, 100)
(50000, 100)
(50000, 10)
                    cluster_fs2   cluster_fs3   cluster_fs4      retweets  \
cluster_fs1                                                                 
0           count   6737.000000   6737.000000   6737.000000   6737.000000   
            mean       1.769779      1.696007      1.990055     21.885706   
            std        2.071550      2.149799      1.811759    386.541178   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        0.000000      0.000000      1.000000      0.000000   
            50%        1.000000      1.000000      1.000000      0.000000   
            75%        3.000000      3.000000      4.000000      2.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
1           count   7896.000000   7896.000000   7896.000000   7896.000000   
            mean       5.291920      1.499113      2.004813     18.976570   
            std        1.562615      2.015671      1.729600    187.945780   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        6.000000      0.000000      1.000000      0.000000   
            50%        6.000000      0.000000      1.000000      0.000000   
            75%        6.000000      2.000000      3.000000      2.000000   
            max        6.000000      6.000000      6.000000   9557.000000   
2           count   6732.000000   6732.000000   6732.000000   6732.000000   
            mean       0.886958      1.985888      2.346554      9.112894   
            std        1.848126      1.190334      1.030649    108.015872   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        0.000000      2.000000      1.000000      0.000000   
            50%        0.000000      2.000000      3.000000      0.000000   
            75%        1.000000      2.000000      3.000000      1.000000   
            max        6.000000      6.000000      6.000000   5192.000000   
3           count   6716.000000   6716.000000   6716.000000   6716.000000   
            mean       1.995831      1.255360      1.793627     10.963371   
            std        1.069986      1.862743      1.512029    115.656744   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      1.000000      0.000000   
            50%        2.000000      0.000000      1.000000      0.000000   
            75%        2.000000      2.000000      3.000000      1.000000   
            max        6.000000      6.000000      6.000000   5812.000000   
4           count  12416.000000  12416.000000  12416.000000  12416.000000   
            mean       1.165432      1.333682      1.962629      0.861147   
            std        1.816148      1.961621      1.668741      7.339446   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        0.000000      0.000000      1.000000      0.000000   
            50%        0.000000      0.000000      1.000000      0.000000   
            75%        3.000000      3.000000      4.000000      0.000000   
            max        5.000000      6.000000      6.000000    303.000000   
5           count   1225.000000   1225.000000   1225.000000   1225.000000   
            mean       4.000000      2.000000      2.000000      0.043265   
            std        0.000000      0.000000      0.000000      0.355520   
            min        4.000000      2.000000      2.000000      0.000000   
            25%        4.000000      2.000000      2.000000      0.000000   
            50%        4.000000      2.000000      2.000000      0.000000   
            75%        4.000000      2.000000      2.000000      0.000000   
            max        4.000000      2.000000      2.000000      6.000000   
6           count   8278.000000   8278.000000   8278.000000   8278.000000   
            mean       1.232786      2.927036      3.494443     47.158734   
            std        0.925145      2.318769      2.348681    458.825604   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      1.000000   
            50%        1.000000      3.000000      4.000000      2.000000   
            75%        2.000000      5.000000      6.000000      9.000000   
            max        3.000000      6.000000      6.000000  31783.000000   

                       tweet_id  
cluster_fs1                      
0           count  6.737000e+03  
            mean   1.801443e+06  
            std    7.058985e+05  
            min    3.163570e+05  
            25%    1.195259e+06  
            50%    2.042541e+06  
            75%    2.377391e+06  
            max    2.678732e+06  
1           count  7.896000e+03  
            mean   1.705424e+06  
            std    7.523128e+05  
            min    3.163150e+05  
            25%    9.946095e+05  
            50%    2.007374e+06  
            75%    2.360434e+06  
            max    2.678850e+06  
2           count  6.732000e+03  
            mean   1.158924e+06  
            std    8.316978e+05  
            min    3.163160e+05  
            25%    3.434955e+05  
            50%    7.048105e+05  
            75%    2.031512e+06  
            max    2.678890e+06  
3           count  6.716000e+03  
            mean   1.743311e+06  
            std    7.460642e+05  
            min    3.163110e+05  
            25%    1.001803e+06  
            50%    2.030056e+06  
            75%    2.376600e+06  
            max    2.678886e+06  
4           count  1.241600e+04  
            mean   1.842138e+06  
            std    6.024604e+05  
            min    3.163520e+05  
            25%    1.631508e+06  
            50%    2.002042e+06  
            75%    2.365838e+06  
            max    2.678849e+06  
5           count  1.225000e+03  
            mean   1.639788e+06  
            std    2.078353e+04  
            min    1.604422e+06  
            25%    1.632814e+06  
            50%    1.637839e+06  
            75%    1.643534e+06  
            max    1.680809e+06  
6           count  8.278000e+03  
            mean   1.731183e+06  
            std    6.952790e+05  
            min    3.163310e+05  
            25%    1.066951e+06  
            50%    1.833227e+06  
            75%    2.357281e+06  
            max    2.678810e+06  
                    cluster_fs1   cluster_fs3   cluster_fs4      retweets  \
cluster_fs2                                                                 
0           count  19163.000000  19163.000000  19163.000000  19163.000000   
            mean       2.927203      1.394041      1.411418      9.667641   
            std        1.707712      1.975340      1.186644    158.210595   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      1.000000      0.000000   
            50%        4.000000      0.000000      1.000000      0.000000   
            75%        4.000000      2.000000      1.000000      1.000000   
            max        6.000000      6.000000      6.000000  14819.000000   
1           count   5565.000000   5565.000000   5565.000000   5565.000000   
            mean       5.044744      3.463971      3.857143     40.458041   
            std        1.990296      2.138543      2.348095    481.668886   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        6.000000      2.000000      1.000000      1.000000   
            50%        6.000000      5.000000      6.000000      2.000000   
            75%        6.000000      5.000000      6.000000      8.000000   
            max        6.000000      6.000000      6.000000  31783.000000   
2           count   7760.000000   7760.000000   7760.000000   7760.000000   
            mean       2.845232      1.633634      1.971521     20.356959   
            std        1.617415      2.078745      1.748114    339.125413   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        3.000000      0.000000      1.000000      0.000000   
            50%        3.000000      0.000000      1.000000      0.000000   
            75%        3.000000      3.000000      3.000000      2.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
3           count   6535.000000   6535.000000   6535.000000   6535.000000   
            mean       3.109870      1.236725      3.726549     13.790360   
            std        2.050760      1.026078      0.931223    155.472062   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      1.000000      4.000000      0.000000   
            50%        4.000000      1.000000      4.000000      0.000000   
            75%        4.000000      1.000000      4.000000      2.000000   
            max        6.000000      6.000000      6.000000   7649.000000   
4           count   1227.000000   1227.000000   1227.000000   1227.000000   
            mean       4.996740      1.999185      1.999185      0.049715   
            std        0.090255      0.028548      0.028548      0.421648   
            min        2.000000      1.000000      1.000000      0.000000   
            25%        5.000000      2.000000      2.000000      0.000000   
            50%        5.000000      2.000000      2.000000      0.000000   
            75%        5.000000      2.000000      2.000000      0.000000   
            max        5.000000      2.000000      2.000000      8.000000   
5           count   1440.000000   1440.000000   1440.000000   1440.000000   
            mean       4.000000      4.000000      5.000000      0.007639   
            std        0.000000      0.000000      0.000000      0.108422   
            min        4.000000      4.000000      5.000000      0.000000   
            25%        4.000000      4.000000      5.000000      0.000000   
            50%        4.000000      4.000000      5.000000      0.000000   
            75%        4.000000      4.000000      5.000000      0.000000   
            max        4.000000      4.000000      5.000000      3.000000   
6           count   8310.000000   8310.000000   8310.000000   8310.000000   
            mean       1.023345      1.605174      1.806017     21.036582   
            std        0.562310      2.075057      1.643320    214.662668   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      0.000000   
            50%        1.000000      0.000000      1.000000      0.000000   
            75%        1.000000      3.000000      3.000000      2.000000   
            max        3.000000      6.000000      6.000000  10004.000000   

                       tweet_id  
cluster_fs2                      
0           count  1.916300e+04  
            mean   1.717102e+06  
            std    7.665425e+05  
            min    3.163160e+05  
            25%    9.977970e+05  
            50%    2.017529e+06  
            75%    2.373836e+06  
            max    2.678890e+06  
1           count  5.565000e+03  
            mean   1.686093e+06  
            std    7.070837e+05  
            min    3.163310e+05  
            25%    1.007599e+06  
            50%    1.809627e+06  
            75%    2.347455e+06  
            max    2.678810e+06  
2           count  7.760000e+03  
            mean   1.701865e+06  
            std    7.698233e+05  
            min    3.164260e+05  
            25%    9.844118e+05  
            50%    2.015080e+06  
            75%    2.371116e+06  
            max    2.678763e+06  
3           count  6.535000e+03  
            mean   1.582939e+06  
            std    7.279300e+05  
            min    3.163110e+05  
            25%    9.615650e+05  
            50%    1.766396e+06  
            75%    2.118306e+06  
            max    2.678832e+06  
4           count  1.227000e+03  
            mean   1.639939e+06  
            std    2.121400e+04  
            min    1.604422e+06  
            25%    1.632815e+06  
            50%    1.637908e+06  
            75%    1.643588e+06  
            max    1.786686e+06  
5           count  1.440000e+03  
            mean   1.649164e+06  
            std    3.472618e+04  
            min    8.675960e+05  
            25%    1.634522e+06  
            50%    1.639980e+06  
            75%    1.644792e+06  
            max    1.770109e+06  
6           count  8.310000e+03  
            mean   1.696490e+06  
            std    7.627692e+05  
            min    3.163150e+05  
            25%    9.881280e+05  
            50%    2.010452e+06  
            75%    2.358236e+06  
            max    2.678850e+06  
                    cluster_fs1   cluster_fs2   cluster_fs4      retweets  \
cluster_fs3                                                                 
0           count  21101.000000  21101.000000  21101.000000  21101.000000   
            mean       2.770106      1.755130      1.156012     14.385148   
            std        1.848761      2.340575      0.608218    266.181865   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      0.000000   
            50%        3.000000      0.000000      1.000000      0.000000   
            75%        4.000000      2.000000      1.000000      1.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
1           count   6136.000000   6136.000000   6136.000000   6136.000000   
            mean       3.040906      2.971480      3.991362     15.660202   
            std        2.009239      0.786379      0.152432    172.023894   
            min        0.000000      0.000000      1.000000      0.000000   
            25%        1.000000      3.000000      4.000000      0.000000   
            50%        4.000000      3.000000      4.000000      0.000000   
            75%        4.000000      3.000000      4.000000      1.000000   
            max        6.000000      6.000000      6.000000   7649.000000   
2           count   9364.000000   9364.000000   9364.000000   9364.000000   
            mean       2.462196      1.956536      2.436459      7.906130   
            std        1.497252      2.242091      0.869262     90.135390   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      2.000000      0.000000   
            50%        2.000000      1.000000      3.000000      0.000000   
            75%        3.000000      4.000000      3.000000      1.000000   
            max        6.000000      6.000000      6.000000   5192.000000   
3           count   3021.000000   3021.000000   3021.000000   3021.000000   
            mean       2.701423      1.999338      1.508772     14.857994   
            std        1.991939      2.295816      1.353790     96.943382   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      0.000000   
            50%        3.000000      1.000000      1.000000      0.000000   
            75%        4.000000      3.000000      1.000000      2.000000   
            max        6.000000      6.000000      6.000000   1991.000000   
4           count   1459.000000   1459.000000   1459.000000   1459.000000   
            mean       3.984236      4.955449      4.994517      0.016450   
            std        0.246565      0.458200      0.148047      0.212124   
            min        0.000000      0.000000      1.000000      0.000000   
            25%        4.000000      5.000000      5.000000      0.000000   
            50%        4.000000      5.000000      5.000000      0.000000   
            75%        4.000000      5.000000      5.000000      0.000000   
            max        6.000000      6.000000      5.000000      6.000000   
5           count   4975.000000   4975.000000   4975.000000   4975.000000   
            mean       4.365025      1.846432      5.882211     45.420101   
            std        2.435581      1.834691      0.600402    517.945374   
            min        0.000000      0.000000      1.000000      0.000000   
            25%        1.000000      1.000000      6.000000      1.000000   
            50%        6.000000      1.000000      6.000000      2.000000   
            75%        6.000000      2.000000      6.000000      8.000000   
            max        6.000000      6.000000      6.000000  31783.000000   
6           count   3944.000000   3944.000000   3944.000000   3944.000000   
            mean       2.816430      1.676724      0.004310     22.526876   
            std        2.039127      2.240929      0.094118    149.352267   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      0.000000      0.000000   
            50%        3.000000      0.000000      0.000000      0.000000   
            75%        4.000000      2.000000      0.000000      3.000000   
            max        6.000000      6.000000      3.000000   3723.000000   

                       tweet_id  
cluster_fs3                      
0           count  2.110100e+04  
            mean   1.836213e+06  
            std    6.824786e+05  
            min    3.163810e+05  
            25%    1.362619e+06  
            50%    2.046974e+06  
            75%    2.388019e+06  
            max    2.678886e+06  
1           count  6.136000e+03  
            mean   1.572405e+06  
            std    7.198915e+05  
            min    3.163170e+05  
            25%    9.614235e+05  
            50%    1.763256e+06  
            75%    2.115240e+06  
            max    2.678809e+06  
2           count  9.364000e+03  
            mean   1.179238e+06  
            std    7.709769e+05  
            min    3.163160e+05  
            25%    5.123998e+05  
            50%    9.583535e+05  
            75%    1.781745e+06  
            max    2.678890e+06  
3           count  3.021000e+03  
            mean   1.822107e+06  
            std    6.996226e+05  
            min    3.163680e+05  
            25%    1.362381e+06  
            50%    2.060101e+06  
            75%    2.379874e+06  
            max    2.678850e+06  
4           count  1.459000e+03  
            mean   1.655039e+06  
            std    8.500740e+04  
            min    6.169490e+05  
            25%    1.634538e+06  
            50%    1.640051e+06  
            75%    1.644934e+06  
            max    2.671737e+06  
5           count  4.975000e+03  
            mean   1.741655e+06  
            std    6.752916e+05  
            min    3.163310e+05  
            25%    1.069214e+06  
            50%    1.831875e+06  
            75%    2.356020e+06  
            max    2.678763e+06  
6           count  3.944000e+03  
            mean   2.105259e+06  
            std    4.979767e+05  
            min    3.163110e+05  
            25%    1.998756e+06  
            50%    2.111720e+06  
            75%    2.502467e+06  
            max    2.678849e+06  
                    cluster_fs1   cluster_fs2   cluster_fs3      retweets  \
cluster_fs4                                                                 
0           count   4283.000000   4283.000000   4283.000000   4283.000000   
            mean       2.799673      1.679897      5.719122     22.739435   
            std        2.043440      2.252323      0.988376    147.136238   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      6.000000      0.000000   
            50%        3.000000      0.000000      6.000000      0.000000   
            75%        4.000000      2.000000      6.000000      2.000000   
            max        6.000000      6.000000      6.000000   3723.000000   
1           count  23794.000000  23794.000000  23794.000000  23794.000000   
            mean       2.706481      1.728083      0.445280     13.427503   
            std        1.837638      2.346297      0.986194    250.776197   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      0.000000      0.000000   
            50%        3.000000      0.000000      0.000000      0.000000   
            75%        4.000000      2.000000      0.000000      1.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
2           count   1366.000000   1366.000000   1366.000000   1366.000000   
            mean       4.737189      3.740849      1.922401      0.614202   
            std        0.957968      1.022904      0.462337      9.939365   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        5.000000      4.000000      2.000000      0.000000   
            50%        5.000000      4.000000      2.000000      0.000000   
            75%        5.000000      4.000000      2.000000      0.000000   
            max        6.000000      6.000000      5.000000    276.000000   
3           count   7331.000000   7331.000000   7331.000000   7331.000000   
            mean       2.197927      1.741918      1.802346     11.303233   
            std        1.316775      2.269350      0.885586     93.034429   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      2.000000      0.000000   
            50%        2.000000      1.000000      2.000000      0.000000   
            75%        2.000000      2.000000      2.000000      1.000000   
            max        6.000000      6.000000      6.000000   3680.000000   
4           count   6771.000000   6771.000000   6771.000000   6771.000000   
            mean       3.016393      2.963669      1.072958     15.048442   
            std        2.007175      0.818812      0.506395    165.790559   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      3.000000      1.000000      0.000000   
            50%        4.000000      3.000000      1.000000      0.000000   
            75%        4.000000      3.000000      1.000000      1.000000   
            max        6.000000      6.000000      3.000000   7649.000000   
5           count   1508.000000   1508.000000   1508.000000   1508.000000   
            mean       3.937003      4.855438      3.865385      0.035146   
            std        0.462971      0.822086      0.720208      0.553170   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        4.000000      5.000000      4.000000      0.000000   
            50%        4.000000      5.000000      4.000000      0.000000   
            75%        4.000000      5.000000      4.000000      0.000000   
            max        6.000000      6.000000      4.000000     19.000000   
6           count   4947.000000   4947.000000   4947.000000   4947.000000   
            mean       4.372549      1.834647      4.907823     46.664645   
            std        2.432520      1.830597      0.542194    525.198659   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      1.000000      5.000000      1.000000   
            50%        6.000000      1.000000      5.000000      2.000000   
            75%        6.000000      2.000000      5.000000      8.000000   
            max        6.000000      6.000000      5.000000  31783.000000   

                       tweet_id  
cluster_fs4                      
0           count  4.283000e+03  
            mean   2.107246e+06  
            std    4.989315e+05  
            min    3.163110e+05  
            25%    1.999204e+06  
            50%    2.114811e+06  
            75%    2.501846e+06  
            max    2.678849e+06  
1           count  2.379400e+04  
            mean   1.853439e+06  
            std    6.643012e+05  
            min    3.163160e+05  
            25%    1.495676e+06  
            50%    2.051500e+06  
            75%    2.388213e+06  
            max    2.678890e+06  
2           count  1.366000e+03  
            mean   1.626001e+06  
            std    2.223969e+05  
            min    3.271520e+05  
            25%    1.628331e+06  
            50%    1.637978e+06  
            75%    1.644521e+06  
            max    2.673290e+06  
3           count  7.331000e+03  
            mean   9.792313e+05  
            std    7.752636e+05  
            min    3.163500e+05  
            25%    3.352245e+05  
            50%    6.101320e+05  
            75%    1.765522e+06  
            max    2.678810e+06  
4           count  6.771000e+03  
            mean   1.563080e+06  
            std    7.261663e+05  
            min    3.163170e+05  
            25%    9.569880e+05  
            50%    1.684840e+06  
            75%    2.115740e+06  
            max    2.678832e+06  
5           count  1.508000e+03  
            mean   1.658689e+06  
            std    1.449377e+05  
            min    3.224040e+05  
            25%    1.634538e+06  
            50%    1.640169e+06  
            75%    1.645240e+06  
            max    2.671737e+06  
6           count  4.947000e+03  
            mean   1.761253e+06  
            std    6.620330e+05  
            min    3.163310e+05  
            25%    1.105948e+06  
            50%    1.836332e+06  
            75%    2.358328e+06  
            max    2.678763e+06  
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
length of data 50000
stemming 
CMU tagger
Running LSA
(50000, 100)
(50000, 100)
(50000, 100)
(50000, 10)
                    cluster_fs2   cluster_fs3   cluster_fs4      retweets  \
cluster_fs1                                                                 
0           count   6737.000000   6737.000000   6737.000000   6737.000000   
            mean       1.769779      1.696007      1.990055     21.885706   
            std        2.071550      2.149799      1.811759    386.541178   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        0.000000      0.000000      1.000000      0.000000   
            50%        1.000000      1.000000      1.000000      0.000000   
            75%        3.000000      3.000000      4.000000      2.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
1           count   7896.000000   7896.000000   7896.000000   7896.000000   
            mean       5.291920      1.499113      2.004813     18.976570   
            std        1.562615      2.015671      1.729600    187.945780   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        6.000000      0.000000      1.000000      0.000000   
            50%        6.000000      0.000000      1.000000      0.000000   
            75%        6.000000      2.000000      3.000000      2.000000   
            max        6.000000      6.000000      6.000000   9557.000000   
2           count   6732.000000   6732.000000   6732.000000   6732.000000   
            mean       0.886958      1.985888      2.346554      9.112894   
            std        1.848126      1.190334      1.030649    108.015872   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        0.000000      2.000000      1.000000      0.000000   
            50%        0.000000      2.000000      3.000000      0.000000   
            75%        1.000000      2.000000      3.000000      1.000000   
            max        6.000000      6.000000      6.000000   5192.000000   
3           count   6716.000000   6716.000000   6716.000000   6716.000000   
            mean       1.995831      1.255360      1.793627     10.963371   
            std        1.069986      1.862743      1.512029    115.656744   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      1.000000      0.000000   
            50%        2.000000      0.000000      1.000000      0.000000   
            75%        2.000000      2.000000      3.000000      1.000000   
            max        6.000000      6.000000      6.000000   5812.000000   
4           count  12416.000000  12416.000000  12416.000000  12416.000000   
            mean       1.165432      1.333682      1.962629      0.861147   
            std        1.816148      1.961621      1.668741      7.339446   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        0.000000      0.000000      1.000000      0.000000   
            50%        0.000000      0.000000      1.000000      0.000000   
            75%        3.000000      3.000000      4.000000      0.000000   
            max        5.000000      6.000000      6.000000    303.000000   
5           count   1225.000000   1225.000000   1225.000000   1225.000000   
            mean       4.000000      2.000000      2.000000      0.043265   
            std        0.000000      0.000000      0.000000      0.355520   
            min        4.000000      2.000000      2.000000      0.000000   
            25%        4.000000      2.000000      2.000000      0.000000   
            50%        4.000000      2.000000      2.000000      0.000000   
            75%        4.000000      2.000000      2.000000      0.000000   
            max        4.000000      2.000000      2.000000      6.000000   
6           count   8278.000000   8278.000000   8278.000000   8278.000000   
            mean       1.232786      2.927036      3.494443     47.158734   
            std        0.925145      2.318769      2.348681    458.825604   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      1.000000   
            50%        1.000000      3.000000      4.000000      2.000000   
            75%        2.000000      5.000000      6.000000      9.000000   
            max        3.000000      6.000000      6.000000  31783.000000   

                       tweet_id  
cluster_fs1                      
0           count  6.737000e+03  
            mean   1.801443e+06  
            std    7.058985e+05  
            min    3.163570e+05  
            25%    1.195259e+06  
            50%    2.042541e+06  
            75%    2.377391e+06  
            max    2.678732e+06  
1           count  7.896000e+03  
            mean   1.705424e+06  
            std    7.523128e+05  
            min    3.163150e+05  
            25%    9.946095e+05  
            50%    2.007374e+06  
            75%    2.360434e+06  
            max    2.678850e+06  
2           count  6.732000e+03  
            mean   1.158924e+06  
            std    8.316978e+05  
            min    3.163160e+05  
            25%    3.434955e+05  
            50%    7.048105e+05  
            75%    2.031512e+06  
            max    2.678890e+06  
3           count  6.716000e+03  
            mean   1.743311e+06  
            std    7.460642e+05  
            min    3.163110e+05  
            25%    1.001803e+06  
            50%    2.030056e+06  
            75%    2.376600e+06  
            max    2.678886e+06  
4           count  1.241600e+04  
            mean   1.842138e+06  
            std    6.024604e+05  
            min    3.163520e+05  
            25%    1.631508e+06  
            50%    2.002042e+06  
            75%    2.365838e+06  
            max    2.678849e+06  
5           count  1.225000e+03  
            mean   1.639788e+06  
            std    2.078353e+04  
            min    1.604422e+06  
            25%    1.632814e+06  
            50%    1.637839e+06  
            75%    1.643534e+06  
            max    1.680809e+06  
6           count  8.278000e+03  
            mean   1.731183e+06  
            std    6.952790e+05  
            min    3.163310e+05  
            25%    1.066951e+06  
            50%    1.833227e+06  
            75%    2.357281e+06  
            max    2.678810e+06  
                    cluster_fs1   cluster_fs3   cluster_fs4      retweets  \
cluster_fs2                                                                 
0           count  19163.000000  19163.000000  19163.000000  19163.000000   
            mean       2.927203      1.394041      1.411418      9.667641   
            std        1.707712      1.975340      1.186644    158.210595   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      1.000000      0.000000   
            50%        4.000000      0.000000      1.000000      0.000000   
            75%        4.000000      2.000000      1.000000      1.000000   
            max        6.000000      6.000000      6.000000  14819.000000   
1           count   5565.000000   5565.000000   5565.000000   5565.000000   
            mean       5.044744      3.463971      3.857143     40.458041   
            std        1.990296      2.138543      2.348095    481.668886   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        6.000000      2.000000      1.000000      1.000000   
            50%        6.000000      5.000000      6.000000      2.000000   
            75%        6.000000      5.000000      6.000000      8.000000   
            max        6.000000      6.000000      6.000000  31783.000000   
2           count   7760.000000   7760.000000   7760.000000   7760.000000   
            mean       2.845232      1.633634      1.971521     20.356959   
            std        1.617415      2.078745      1.748114    339.125413   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        3.000000      0.000000      1.000000      0.000000   
            50%        3.000000      0.000000      1.000000      0.000000   
            75%        3.000000      3.000000      3.000000      2.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
3           count   6535.000000   6535.000000   6535.000000   6535.000000   
            mean       3.109870      1.236725      3.726549     13.790360   
            std        2.050760      1.026078      0.931223    155.472062   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      1.000000      4.000000      0.000000   
            50%        4.000000      1.000000      4.000000      0.000000   
            75%        4.000000      1.000000      4.000000      2.000000   
            max        6.000000      6.000000      6.000000   7649.000000   
4           count   1227.000000   1227.000000   1227.000000   1227.000000   
            mean       4.996740      1.999185      1.999185      0.049715   
            std        0.090255      0.028548      0.028548      0.421648   
            min        2.000000      1.000000      1.000000      0.000000   
            25%        5.000000      2.000000      2.000000      0.000000   
            50%        5.000000      2.000000      2.000000      0.000000   
            75%        5.000000      2.000000      2.000000      0.000000   
            max        5.000000      2.000000      2.000000      8.000000   
5           count   1440.000000   1440.000000   1440.000000   1440.000000   
            mean       4.000000      4.000000      5.000000      0.007639   
            std        0.000000      0.000000      0.000000      0.108422   
            min        4.000000      4.000000      5.000000      0.000000   
            25%        4.000000      4.000000      5.000000      0.000000   
            50%        4.000000      4.000000      5.000000      0.000000   
            75%        4.000000      4.000000      5.000000      0.000000   
            max        4.000000      4.000000      5.000000      3.000000   
6           count   8310.000000   8310.000000   8310.000000   8310.000000   
            mean       1.023345      1.605174      1.806017     21.036582   
            std        0.562310      2.075057      1.643320    214.662668   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      0.000000   
            50%        1.000000      0.000000      1.000000      0.000000   
            75%        1.000000      3.000000      3.000000      2.000000   
            max        3.000000      6.000000      6.000000  10004.000000   

                       tweet_id  
cluster_fs2                      
0           count  1.916300e+04  
            mean   1.717102e+06  
            std    7.665425e+05  
            min    3.163160e+05  
            25%    9.977970e+05  
            50%    2.017529e+06  
            75%    2.373836e+06  
            max    2.678890e+06  
1           count  5.565000e+03  
            mean   1.686093e+06  
            std    7.070837e+05  
            min    3.163310e+05  
            25%    1.007599e+06  
            50%    1.809627e+06  
            75%    2.347455e+06  
            max    2.678810e+06  
2           count  7.760000e+03  
            mean   1.701865e+06  
            std    7.698233e+05  
            min    3.164260e+05  
            25%    9.844118e+05  
            50%    2.015080e+06  
            75%    2.371116e+06  
            max    2.678763e+06  
3           count  6.535000e+03  
            mean   1.582939e+06  
            std    7.279300e+05  
            min    3.163110e+05  
            25%    9.615650e+05  
            50%    1.766396e+06  
            75%    2.118306e+06  
            max    2.678832e+06  
4           count  1.227000e+03  
            mean   1.639939e+06  
            std    2.121400e+04  
            min    1.604422e+06  
            25%    1.632815e+06  
            50%    1.637908e+06  
            75%    1.643588e+06  
            max    1.786686e+06  
5           count  1.440000e+03  
            mean   1.649164e+06  
            std    3.472618e+04  
            min    8.675960e+05  
            25%    1.634522e+06  
            50%    1.639980e+06  
            75%    1.644792e+06  
            max    1.770109e+06  
6           count  8.310000e+03  
            mean   1.696490e+06  
            std    7.627692e+05  
            min    3.163150e+05  
            25%    9.881280e+05  
            50%    2.010452e+06  
            75%    2.358236e+06  
            max    2.678850e+06  
                    cluster_fs1   cluster_fs2   cluster_fs4      retweets  \
cluster_fs3                                                                 
0           count  21101.000000  21101.000000  21101.000000  21101.000000   
            mean       2.770106      1.755130      1.156012     14.385148   
            std        1.848761      2.340575      0.608218    266.181865   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      0.000000   
            50%        3.000000      0.000000      1.000000      0.000000   
            75%        4.000000      2.000000      1.000000      1.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
1           count   6136.000000   6136.000000   6136.000000   6136.000000   
            mean       3.040906      2.971480      3.991362     15.660202   
            std        2.009239      0.786379      0.152432    172.023894   
            min        0.000000      0.000000      1.000000      0.000000   
            25%        1.000000      3.000000      4.000000      0.000000   
            50%        4.000000      3.000000      4.000000      0.000000   
            75%        4.000000      3.000000      4.000000      1.000000   
            max        6.000000      6.000000      6.000000   7649.000000   
2           count   9364.000000   9364.000000   9364.000000   9364.000000   
            mean       2.462196      1.956536      2.436459      7.906130   
            std        1.497252      2.242091      0.869262     90.135390   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      2.000000      0.000000   
            50%        2.000000      1.000000      3.000000      0.000000   
            75%        3.000000      4.000000      3.000000      1.000000   
            max        6.000000      6.000000      6.000000   5192.000000   
3           count   3021.000000   3021.000000   3021.000000   3021.000000   
            mean       2.701423      1.999338      1.508772     14.857994   
            std        1.991939      2.295816      1.353790     96.943382   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      1.000000      0.000000   
            50%        3.000000      1.000000      1.000000      0.000000   
            75%        4.000000      3.000000      1.000000      2.000000   
            max        6.000000      6.000000      6.000000   1991.000000   
4           count   1459.000000   1459.000000   1459.000000   1459.000000   
            mean       3.984236      4.955449      4.994517      0.016450   
            std        0.246565      0.458200      0.148047      0.212124   
            min        0.000000      0.000000      1.000000      0.000000   
            25%        4.000000      5.000000      5.000000      0.000000   
            50%        4.000000      5.000000      5.000000      0.000000   
            75%        4.000000      5.000000      5.000000      0.000000   
            max        6.000000      6.000000      5.000000      6.000000   
5           count   4975.000000   4975.000000   4975.000000   4975.000000   
            mean       4.365025      1.846432      5.882211     45.420101   
            std        2.435581      1.834691      0.600402    517.945374   
            min        0.000000      0.000000      1.000000      0.000000   
            25%        1.000000      1.000000      6.000000      1.000000   
            50%        6.000000      1.000000      6.000000      2.000000   
            75%        6.000000      2.000000      6.000000      8.000000   
            max        6.000000      6.000000      6.000000  31783.000000   
6           count   3944.000000   3944.000000   3944.000000   3944.000000   
            mean       2.816430      1.676724      0.004310     22.526876   
            std        2.039127      2.240929      0.094118    149.352267   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      0.000000      0.000000   
            50%        3.000000      0.000000      0.000000      0.000000   
            75%        4.000000      2.000000      0.000000      3.000000   
            max        6.000000      6.000000      3.000000   3723.000000   

                       tweet_id  
cluster_fs3                      
0           count  2.110100e+04  
            mean   1.836213e+06  
            std    6.824786e+05  
            min    3.163810e+05  
            25%    1.362619e+06  
            50%    2.046974e+06  
            75%    2.388019e+06  
            max    2.678886e+06  
1           count  6.136000e+03  
            mean   1.572405e+06  
            std    7.198915e+05  
            min    3.163170e+05  
            25%    9.614235e+05  
            50%    1.763256e+06  
            75%    2.115240e+06  
            max    2.678809e+06  
2           count  9.364000e+03  
            mean   1.179238e+06  
            std    7.709769e+05  
            min    3.163160e+05  
            25%    5.123998e+05  
            50%    9.583535e+05  
            75%    1.781745e+06  
            max    2.678890e+06  
3           count  3.021000e+03  
            mean   1.822107e+06  
            std    6.996226e+05  
            min    3.163680e+05  
            25%    1.362381e+06  
            50%    2.060101e+06  
            75%    2.379874e+06  
            max    2.678850e+06  
4           count  1.459000e+03  
            mean   1.655039e+06  
            std    8.500740e+04  
            min    6.169490e+05  
            25%    1.634538e+06  
            50%    1.640051e+06  
            75%    1.644934e+06  
            max    2.671737e+06  
5           count  4.975000e+03  
            mean   1.741655e+06  
            std    6.752916e+05  
            min    3.163310e+05  
            25%    1.069214e+06  
            50%    1.831875e+06  
            75%    2.356020e+06  
            max    2.678763e+06  
6           count  3.944000e+03  
            mean   2.105259e+06  
            std    4.979767e+05  
            min    3.163110e+05  
            25%    1.998756e+06  
            50%    2.111720e+06  
            75%    2.502467e+06  
            max    2.678849e+06  
                    cluster_fs1   cluster_fs2   cluster_fs3      retweets  \
cluster_fs4                                                                 
0           count   4283.000000   4283.000000   4283.000000   4283.000000   
            mean       2.799673      1.679897      5.719122     22.739435   
            std        2.043440      2.252323      0.988376    147.136238   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      6.000000      0.000000   
            50%        3.000000      0.000000      6.000000      0.000000   
            75%        4.000000      2.000000      6.000000      2.000000   
            max        6.000000      6.000000      6.000000   3723.000000   
1           count  23794.000000  23794.000000  23794.000000  23794.000000   
            mean       2.706481      1.728083      0.445280     13.427503   
            std        1.837638      2.346297      0.986194    250.776197   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      0.000000      0.000000      0.000000   
            50%        3.000000      0.000000      0.000000      0.000000   
            75%        4.000000      2.000000      0.000000      1.000000   
            max        6.000000      6.000000      6.000000  27137.000000   
2           count   1366.000000   1366.000000   1366.000000   1366.000000   
            mean       4.737189      3.740849      1.922401      0.614202   
            std        0.957968      1.022904      0.462337      9.939365   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        5.000000      4.000000      2.000000      0.000000   
            50%        5.000000      4.000000      2.000000      0.000000   
            75%        5.000000      4.000000      2.000000      0.000000   
            max        6.000000      6.000000      5.000000    276.000000   
3           count   7331.000000   7331.000000   7331.000000   7331.000000   
            mean       2.197927      1.741918      1.802346     11.303233   
            std        1.316775      2.269350      0.885586     93.034429   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        2.000000      0.000000      2.000000      0.000000   
            50%        2.000000      1.000000      2.000000      0.000000   
            75%        2.000000      2.000000      2.000000      1.000000   
            max        6.000000      6.000000      6.000000   3680.000000   
4           count   6771.000000   6771.000000   6771.000000   6771.000000   
            mean       3.016393      2.963669      1.072958     15.048442   
            std        2.007175      0.818812      0.506395    165.790559   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      3.000000      1.000000      0.000000   
            50%        4.000000      3.000000      1.000000      0.000000   
            75%        4.000000      3.000000      1.000000      1.000000   
            max        6.000000      6.000000      3.000000   7649.000000   
5           count   1508.000000   1508.000000   1508.000000   1508.000000   
            mean       3.937003      4.855438      3.865385      0.035146   
            std        0.462971      0.822086      0.720208      0.553170   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        4.000000      5.000000      4.000000      0.000000   
            50%        4.000000      5.000000      4.000000      0.000000   
            75%        4.000000      5.000000      4.000000      0.000000   
            max        6.000000      6.000000      4.000000     19.000000   
6           count   4947.000000   4947.000000   4947.000000   4947.000000   
            mean       4.372549      1.834647      4.907823     46.664645   
            std        2.432520      1.830597      0.542194    525.198659   
            min        0.000000      0.000000      0.000000      0.000000   
            25%        1.000000      1.000000      5.000000      1.000000   
            50%        6.000000      1.000000      5.000000      2.000000   
            75%        6.000000      2.000000      5.000000      8.000000   
            max        6.000000      6.000000      5.000000  31783.000000   

                       tweet_id  
cluster_fs4                      
0           count  4.283000e+03  
            mean   2.107246e+06  
            std    4.989315e+05  
            min    3.163110e+05  
            25%    1.999204e+06  
            50%    2.114811e+06  
            75%    2.501846e+06  
            max    2.678849e+06  
1           count  2.379400e+04  
            mean   1.853439e+06  
            std    6.643012e+05  
            min    3.163160e+05  
            25%    1.495676e+06  
            50%    2.051500e+06  
            75%    2.388213e+06  
            max    2.678890e+06  
2           count  1.366000e+03  
            mean   1.626001e+06  
            std    2.223969e+05  
            min    3.271520e+05  
            25%    1.628331e+06  
            50%    1.637978e+06  
            75%    1.644521e+06  
            max    2.673290e+06  
3           count  7.331000e+03  
            mean   9.792313e+05  
            std    7.752636e+05  
            min    3.163500e+05  
            25%    3.352245e+05  
            50%    6.101320e+05  
            75%    1.765522e+06  
            max    2.678810e+06  
4           count  6.771000e+03  
            mean   1.563080e+06  
            std    7.261663e+05  
            min    3.163170e+05  
            25%    9.569880e+05  
            50%    1.684840e+06  
            75%    2.115740e+06  
            max    2.678832e+06  
5           count  1.508000e+03  
            mean   1.658689e+06  
            std    1.449377e+05  
            min    3.224040e+05  
            25%    1.634538e+06  
            50%    1.640169e+06  
            75%    1.645240e+06  
            max    2.671737e+06  
6           count  4.947000e+03  
            mean   1.761253e+06  
            std    6.620330e+05  
            min    3.163310e+05  
            25%    1.105948e+06  
            50%    1.836332e+06  
            75%    2.358328e+06  
            max    2.678763e+06  

In [ ]:


In [ ]:


In [ ]:


In [ ]: