In [6]:
from preprocess import *
from feature_extraction import *
from lsi import *
from kmeans import *
import CMUTweetTagger
def get_feature_sets(filename):
df = preprocess(filename)
print "length of data", len(df)
data_sample = df['text'].str.lower()
#TFIDF features without text processing
[data_fs2, vectorizer, no_features] = vectorize(data_sample, TFIDF) #feature set 2
#Unigram features without text processing
[data_fs1, vectorizer, no_features] = vectorize(data_sample, UNI) #Feature set 1
#Text preprocessing - stopwords, stemming, lowercase
data_fs3 = tokenize_and_stopwords(data_sample)
data_fs3 = stemmer(data_fs3)
#use CMU tagger and remove NNP and NNPS
print "CMU tagger"
all_tags = CMUTweetTagger.runtagger_parse(data_fs3)
for i in range(len(all_tags)):
for tag in all_tags[i]:
# print tag[1]
if tag[1] == 'NNP' or tag[1] == 'NNPS':
data_fs3[i] = data_fs3[i].replace(tag[0], '')
[data_fs3, vectorizer, no_features] = vectorize(data_fs3, TFIDF) #Feature set 3
data_fs4 = lsa(data_fs3) #feature set 4
print data_fs1.shape
print data_fs2.shape
print data_fs3.shape
print data_fs4.shape
return [data_fs1, data_fs2, data_fs3, data_fs4, df]
In [7]:
def kmeans_analysis(filename):
no_clusters = 5
filename = "clinton-50k.csv"
[data_fs1, data_fs2, data_fs3, data_fs4, df1] = get_feature_sets(filename)
df = df1[['tweet_id','text', 'retweets']].copy()
df['cluster_fs1'] = run_kmeans(data_fs1, 7)
df['cluster_fs2'] = run_kmeans(data_fs2, 7)
df['cluster_fs3'] = run_kmeans(data_fs3, 7)
df['cluster_fs4'] = run_kmeans(data_fs4, 7)
result_filename = filename.replace(".csv", "") +"-test.csv"
df.to_csv(result_filename)
return df
In [8]:
def perform_analysis(df):
print df.groupby(['cluster_fs1']).describe()
print df.groupby(['cluster_fs2']).describe()
print df.groupby(['cluster_fs3']).describe()
print df.groupby(['cluster_fs4']).describe()
df.corr()
result1 = df.sort(['cluster_fs1'])
result2 = df.sort(['cluster_fs2'])
result3 = df.sort(['cluster_fs3'])
result4 = df.sort(['cluster_fs4'])
result4
In [9]:
def main():
f1 = "trump-50k.csv"
f2 = "clinton-50k.csv"
df1 = kmeans_analysis(f1)
perform_analysis(df1)
df2 = kmeans_analysis(f2)
perform_analysis(df2)
In [10]:
main()
length of data 50000
stemming
CMU tagger
Running LSA
(50000, 100)
(50000, 100)
(50000, 100)
(50000, 10)
cluster_fs2 cluster_fs3 cluster_fs4 retweets \
cluster_fs1
0 count 6737.000000 6737.000000 6737.000000 6737.000000
mean 1.769779 1.696007 1.990055 21.885706
std 2.071550 2.149799 1.811759 386.541178
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 1.000000 0.000000
50% 1.000000 1.000000 1.000000 0.000000
75% 3.000000 3.000000 4.000000 2.000000
max 6.000000 6.000000 6.000000 27137.000000
1 count 7896.000000 7896.000000 7896.000000 7896.000000
mean 5.291920 1.499113 2.004813 18.976570
std 1.562615 2.015671 1.729600 187.945780
min 0.000000 0.000000 0.000000 0.000000
25% 6.000000 0.000000 1.000000 0.000000
50% 6.000000 0.000000 1.000000 0.000000
75% 6.000000 2.000000 3.000000 2.000000
max 6.000000 6.000000 6.000000 9557.000000
2 count 6732.000000 6732.000000 6732.000000 6732.000000
mean 0.886958 1.985888 2.346554 9.112894
std 1.848126 1.190334 1.030649 108.015872
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 1.000000 0.000000
50% 0.000000 2.000000 3.000000 0.000000
75% 1.000000 2.000000 3.000000 1.000000
max 6.000000 6.000000 6.000000 5192.000000
3 count 6716.000000 6716.000000 6716.000000 6716.000000
mean 1.995831 1.255360 1.793627 10.963371
std 1.069986 1.862743 1.512029 115.656744
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 1.000000 0.000000
50% 2.000000 0.000000 1.000000 0.000000
75% 2.000000 2.000000 3.000000 1.000000
max 6.000000 6.000000 6.000000 5812.000000
4 count 12416.000000 12416.000000 12416.000000 12416.000000
mean 1.165432 1.333682 1.962629 0.861147
std 1.816148 1.961621 1.668741 7.339446
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 1.000000 0.000000
50% 0.000000 0.000000 1.000000 0.000000
75% 3.000000 3.000000 4.000000 0.000000
max 5.000000 6.000000 6.000000 303.000000
5 count 1225.000000 1225.000000 1225.000000 1225.000000
mean 4.000000 2.000000 2.000000 0.043265
std 0.000000 0.000000 0.000000 0.355520
min 4.000000 2.000000 2.000000 0.000000
25% 4.000000 2.000000 2.000000 0.000000
50% 4.000000 2.000000 2.000000 0.000000
75% 4.000000 2.000000 2.000000 0.000000
max 4.000000 2.000000 2.000000 6.000000
6 count 8278.000000 8278.000000 8278.000000 8278.000000
mean 1.232786 2.927036 3.494443 47.158734
std 0.925145 2.318769 2.348681 458.825604
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 1.000000
50% 1.000000 3.000000 4.000000 2.000000
75% 2.000000 5.000000 6.000000 9.000000
max 3.000000 6.000000 6.000000 31783.000000
tweet_id
cluster_fs1
0 count 6.737000e+03
mean 1.801443e+06
std 7.058985e+05
min 3.163570e+05
25% 1.195259e+06
50% 2.042541e+06
75% 2.377391e+06
max 2.678732e+06
1 count 7.896000e+03
mean 1.705424e+06
std 7.523128e+05
min 3.163150e+05
25% 9.946095e+05
50% 2.007374e+06
75% 2.360434e+06
max 2.678850e+06
2 count 6.732000e+03
mean 1.158924e+06
std 8.316978e+05
min 3.163160e+05
25% 3.434955e+05
50% 7.048105e+05
75% 2.031512e+06
max 2.678890e+06
3 count 6.716000e+03
mean 1.743311e+06
std 7.460642e+05
min 3.163110e+05
25% 1.001803e+06
50% 2.030056e+06
75% 2.376600e+06
max 2.678886e+06
4 count 1.241600e+04
mean 1.842138e+06
std 6.024604e+05
min 3.163520e+05
25% 1.631508e+06
50% 2.002042e+06
75% 2.365838e+06
max 2.678849e+06
5 count 1.225000e+03
mean 1.639788e+06
std 2.078353e+04
min 1.604422e+06
25% 1.632814e+06
50% 1.637839e+06
75% 1.643534e+06
max 1.680809e+06
6 count 8.278000e+03
mean 1.731183e+06
std 6.952790e+05
min 3.163310e+05
25% 1.066951e+06
50% 1.833227e+06
75% 2.357281e+06
max 2.678810e+06
cluster_fs1 cluster_fs3 cluster_fs4 retweets \
cluster_fs2
0 count 19163.000000 19163.000000 19163.000000 19163.000000
mean 2.927203 1.394041 1.411418 9.667641
std 1.707712 1.975340 1.186644 158.210595
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 1.000000 0.000000
50% 4.000000 0.000000 1.000000 0.000000
75% 4.000000 2.000000 1.000000 1.000000
max 6.000000 6.000000 6.000000 14819.000000
1 count 5565.000000 5565.000000 5565.000000 5565.000000
mean 5.044744 3.463971 3.857143 40.458041
std 1.990296 2.138543 2.348095 481.668886
min 0.000000 0.000000 0.000000 0.000000
25% 6.000000 2.000000 1.000000 1.000000
50% 6.000000 5.000000 6.000000 2.000000
75% 6.000000 5.000000 6.000000 8.000000
max 6.000000 6.000000 6.000000 31783.000000
2 count 7760.000000 7760.000000 7760.000000 7760.000000
mean 2.845232 1.633634 1.971521 20.356959
std 1.617415 2.078745 1.748114 339.125413
min 0.000000 0.000000 0.000000 0.000000
25% 3.000000 0.000000 1.000000 0.000000
50% 3.000000 0.000000 1.000000 0.000000
75% 3.000000 3.000000 3.000000 2.000000
max 6.000000 6.000000 6.000000 27137.000000
3 count 6535.000000 6535.000000 6535.000000 6535.000000
mean 3.109870 1.236725 3.726549 13.790360
std 2.050760 1.026078 0.931223 155.472062
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 1.000000 4.000000 0.000000
50% 4.000000 1.000000 4.000000 0.000000
75% 4.000000 1.000000 4.000000 2.000000
max 6.000000 6.000000 6.000000 7649.000000
4 count 1227.000000 1227.000000 1227.000000 1227.000000
mean 4.996740 1.999185 1.999185 0.049715
std 0.090255 0.028548 0.028548 0.421648
min 2.000000 1.000000 1.000000 0.000000
25% 5.000000 2.000000 2.000000 0.000000
50% 5.000000 2.000000 2.000000 0.000000
75% 5.000000 2.000000 2.000000 0.000000
max 5.000000 2.000000 2.000000 8.000000
5 count 1440.000000 1440.000000 1440.000000 1440.000000
mean 4.000000 4.000000 5.000000 0.007639
std 0.000000 0.000000 0.000000 0.108422
min 4.000000 4.000000 5.000000 0.000000
25% 4.000000 4.000000 5.000000 0.000000
50% 4.000000 4.000000 5.000000 0.000000
75% 4.000000 4.000000 5.000000 0.000000
max 4.000000 4.000000 5.000000 3.000000
6 count 8310.000000 8310.000000 8310.000000 8310.000000
mean 1.023345 1.605174 1.806017 21.036582
std 0.562310 2.075057 1.643320 214.662668
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 1.000000 0.000000 1.000000 0.000000
75% 1.000000 3.000000 3.000000 2.000000
max 3.000000 6.000000 6.000000 10004.000000
tweet_id
cluster_fs2
0 count 1.916300e+04
mean 1.717102e+06
std 7.665425e+05
min 3.163160e+05
25% 9.977970e+05
50% 2.017529e+06
75% 2.373836e+06
max 2.678890e+06
1 count 5.565000e+03
mean 1.686093e+06
std 7.070837e+05
min 3.163310e+05
25% 1.007599e+06
50% 1.809627e+06
75% 2.347455e+06
max 2.678810e+06
2 count 7.760000e+03
mean 1.701865e+06
std 7.698233e+05
min 3.164260e+05
25% 9.844118e+05
50% 2.015080e+06
75% 2.371116e+06
max 2.678763e+06
3 count 6.535000e+03
mean 1.582939e+06
std 7.279300e+05
min 3.163110e+05
25% 9.615650e+05
50% 1.766396e+06
75% 2.118306e+06
max 2.678832e+06
4 count 1.227000e+03
mean 1.639939e+06
std 2.121400e+04
min 1.604422e+06
25% 1.632815e+06
50% 1.637908e+06
75% 1.643588e+06
max 1.786686e+06
5 count 1.440000e+03
mean 1.649164e+06
std 3.472618e+04
min 8.675960e+05
25% 1.634522e+06
50% 1.639980e+06
75% 1.644792e+06
max 1.770109e+06
6 count 8.310000e+03
mean 1.696490e+06
std 7.627692e+05
min 3.163150e+05
25% 9.881280e+05
50% 2.010452e+06
75% 2.358236e+06
max 2.678850e+06
cluster_fs1 cluster_fs2 cluster_fs4 retweets \
cluster_fs3
0 count 21101.000000 21101.000000 21101.000000 21101.000000
mean 2.770106 1.755130 1.156012 14.385148
std 1.848761 2.340575 0.608218 266.181865
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 3.000000 0.000000 1.000000 0.000000
75% 4.000000 2.000000 1.000000 1.000000
max 6.000000 6.000000 6.000000 27137.000000
1 count 6136.000000 6136.000000 6136.000000 6136.000000
mean 3.040906 2.971480 3.991362 15.660202
std 2.009239 0.786379 0.152432 172.023894
min 0.000000 0.000000 1.000000 0.000000
25% 1.000000 3.000000 4.000000 0.000000
50% 4.000000 3.000000 4.000000 0.000000
75% 4.000000 3.000000 4.000000 1.000000
max 6.000000 6.000000 6.000000 7649.000000
2 count 9364.000000 9364.000000 9364.000000 9364.000000
mean 2.462196 1.956536 2.436459 7.906130
std 1.497252 2.242091 0.869262 90.135390
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 2.000000 0.000000
50% 2.000000 1.000000 3.000000 0.000000
75% 3.000000 4.000000 3.000000 1.000000
max 6.000000 6.000000 6.000000 5192.000000
3 count 3021.000000 3021.000000 3021.000000 3021.000000
mean 2.701423 1.999338 1.508772 14.857994
std 1.991939 2.295816 1.353790 96.943382
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 3.000000 1.000000 1.000000 0.000000
75% 4.000000 3.000000 1.000000 2.000000
max 6.000000 6.000000 6.000000 1991.000000
4 count 1459.000000 1459.000000 1459.000000 1459.000000
mean 3.984236 4.955449 4.994517 0.016450
std 0.246565 0.458200 0.148047 0.212124
min 0.000000 0.000000 1.000000 0.000000
25% 4.000000 5.000000 5.000000 0.000000
50% 4.000000 5.000000 5.000000 0.000000
75% 4.000000 5.000000 5.000000 0.000000
max 6.000000 6.000000 5.000000 6.000000
5 count 4975.000000 4975.000000 4975.000000 4975.000000
mean 4.365025 1.846432 5.882211 45.420101
std 2.435581 1.834691 0.600402 517.945374
min 0.000000 0.000000 1.000000 0.000000
25% 1.000000 1.000000 6.000000 1.000000
50% 6.000000 1.000000 6.000000 2.000000
75% 6.000000 2.000000 6.000000 8.000000
max 6.000000 6.000000 6.000000 31783.000000
6 count 3944.000000 3944.000000 3944.000000 3944.000000
mean 2.816430 1.676724 0.004310 22.526876
std 2.039127 2.240929 0.094118 149.352267
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000 0.000000
50% 3.000000 0.000000 0.000000 0.000000
75% 4.000000 2.000000 0.000000 3.000000
max 6.000000 6.000000 3.000000 3723.000000
tweet_id
cluster_fs3
0 count 2.110100e+04
mean 1.836213e+06
std 6.824786e+05
min 3.163810e+05
25% 1.362619e+06
50% 2.046974e+06
75% 2.388019e+06
max 2.678886e+06
1 count 6.136000e+03
mean 1.572405e+06
std 7.198915e+05
min 3.163170e+05
25% 9.614235e+05
50% 1.763256e+06
75% 2.115240e+06
max 2.678809e+06
2 count 9.364000e+03
mean 1.179238e+06
std 7.709769e+05
min 3.163160e+05
25% 5.123998e+05
50% 9.583535e+05
75% 1.781745e+06
max 2.678890e+06
3 count 3.021000e+03
mean 1.822107e+06
std 6.996226e+05
min 3.163680e+05
25% 1.362381e+06
50% 2.060101e+06
75% 2.379874e+06
max 2.678850e+06
4 count 1.459000e+03
mean 1.655039e+06
std 8.500740e+04
min 6.169490e+05
25% 1.634538e+06
50% 1.640051e+06
75% 1.644934e+06
max 2.671737e+06
5 count 4.975000e+03
mean 1.741655e+06
std 6.752916e+05
min 3.163310e+05
25% 1.069214e+06
50% 1.831875e+06
75% 2.356020e+06
max 2.678763e+06
6 count 3.944000e+03
mean 2.105259e+06
std 4.979767e+05
min 3.163110e+05
25% 1.998756e+06
50% 2.111720e+06
75% 2.502467e+06
max 2.678849e+06
cluster_fs1 cluster_fs2 cluster_fs3 retweets \
cluster_fs4
0 count 4283.000000 4283.000000 4283.000000 4283.000000
mean 2.799673 1.679897 5.719122 22.739435
std 2.043440 2.252323 0.988376 147.136238
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 6.000000 0.000000
50% 3.000000 0.000000 6.000000 0.000000
75% 4.000000 2.000000 6.000000 2.000000
max 6.000000 6.000000 6.000000 3723.000000
1 count 23794.000000 23794.000000 23794.000000 23794.000000
mean 2.706481 1.728083 0.445280 13.427503
std 1.837638 2.346297 0.986194 250.776197
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000 0.000000
50% 3.000000 0.000000 0.000000 0.000000
75% 4.000000 2.000000 0.000000 1.000000
max 6.000000 6.000000 6.000000 27137.000000
2 count 1366.000000 1366.000000 1366.000000 1366.000000
mean 4.737189 3.740849 1.922401 0.614202
std 0.957968 1.022904 0.462337 9.939365
min 0.000000 0.000000 0.000000 0.000000
25% 5.000000 4.000000 2.000000 0.000000
50% 5.000000 4.000000 2.000000 0.000000
75% 5.000000 4.000000 2.000000 0.000000
max 6.000000 6.000000 5.000000 276.000000
3 count 7331.000000 7331.000000 7331.000000 7331.000000
mean 2.197927 1.741918 1.802346 11.303233
std 1.316775 2.269350 0.885586 93.034429
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 2.000000 0.000000
50% 2.000000 1.000000 2.000000 0.000000
75% 2.000000 2.000000 2.000000 1.000000
max 6.000000 6.000000 6.000000 3680.000000
4 count 6771.000000 6771.000000 6771.000000 6771.000000
mean 3.016393 2.963669 1.072958 15.048442
std 2.007175 0.818812 0.506395 165.790559
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 3.000000 1.000000 0.000000
50% 4.000000 3.000000 1.000000 0.000000
75% 4.000000 3.000000 1.000000 1.000000
max 6.000000 6.000000 3.000000 7649.000000
5 count 1508.000000 1508.000000 1508.000000 1508.000000
mean 3.937003 4.855438 3.865385 0.035146
std 0.462971 0.822086 0.720208 0.553170
min 0.000000 0.000000 0.000000 0.000000
25% 4.000000 5.000000 4.000000 0.000000
50% 4.000000 5.000000 4.000000 0.000000
75% 4.000000 5.000000 4.000000 0.000000
max 6.000000 6.000000 4.000000 19.000000
6 count 4947.000000 4947.000000 4947.000000 4947.000000
mean 4.372549 1.834647 4.907823 46.664645
std 2.432520 1.830597 0.542194 525.198659
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 1.000000 5.000000 1.000000
50% 6.000000 1.000000 5.000000 2.000000
75% 6.000000 2.000000 5.000000 8.000000
max 6.000000 6.000000 5.000000 31783.000000
tweet_id
cluster_fs4
0 count 4.283000e+03
mean 2.107246e+06
std 4.989315e+05
min 3.163110e+05
25% 1.999204e+06
50% 2.114811e+06
75% 2.501846e+06
max 2.678849e+06
1 count 2.379400e+04
mean 1.853439e+06
std 6.643012e+05
min 3.163160e+05
25% 1.495676e+06
50% 2.051500e+06
75% 2.388213e+06
max 2.678890e+06
2 count 1.366000e+03
mean 1.626001e+06
std 2.223969e+05
min 3.271520e+05
25% 1.628331e+06
50% 1.637978e+06
75% 1.644521e+06
max 2.673290e+06
3 count 7.331000e+03
mean 9.792313e+05
std 7.752636e+05
min 3.163500e+05
25% 3.352245e+05
50% 6.101320e+05
75% 1.765522e+06
max 2.678810e+06
4 count 6.771000e+03
mean 1.563080e+06
std 7.261663e+05
min 3.163170e+05
25% 9.569880e+05
50% 1.684840e+06
75% 2.115740e+06
max 2.678832e+06
5 count 1.508000e+03
mean 1.658689e+06
std 1.449377e+05
min 3.224040e+05
25% 1.634538e+06
50% 1.640169e+06
75% 1.645240e+06
max 2.671737e+06
6 count 4.947000e+03
mean 1.761253e+06
std 6.620330e+05
min 3.163310e+05
25% 1.105948e+06
50% 1.836332e+06
75% 2.358328e+06
max 2.678763e+06
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
length of data 50000
stemming
CMU tagger
Running LSA
(50000, 100)
(50000, 100)
(50000, 100)
(50000, 10)
cluster_fs2 cluster_fs3 cluster_fs4 retweets \
cluster_fs1
0 count 6737.000000 6737.000000 6737.000000 6737.000000
mean 1.769779 1.696007 1.990055 21.885706
std 2.071550 2.149799 1.811759 386.541178
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 1.000000 0.000000
50% 1.000000 1.000000 1.000000 0.000000
75% 3.000000 3.000000 4.000000 2.000000
max 6.000000 6.000000 6.000000 27137.000000
1 count 7896.000000 7896.000000 7896.000000 7896.000000
mean 5.291920 1.499113 2.004813 18.976570
std 1.562615 2.015671 1.729600 187.945780
min 0.000000 0.000000 0.000000 0.000000
25% 6.000000 0.000000 1.000000 0.000000
50% 6.000000 0.000000 1.000000 0.000000
75% 6.000000 2.000000 3.000000 2.000000
max 6.000000 6.000000 6.000000 9557.000000
2 count 6732.000000 6732.000000 6732.000000 6732.000000
mean 0.886958 1.985888 2.346554 9.112894
std 1.848126 1.190334 1.030649 108.015872
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 1.000000 0.000000
50% 0.000000 2.000000 3.000000 0.000000
75% 1.000000 2.000000 3.000000 1.000000
max 6.000000 6.000000 6.000000 5192.000000
3 count 6716.000000 6716.000000 6716.000000 6716.000000
mean 1.995831 1.255360 1.793627 10.963371
std 1.069986 1.862743 1.512029 115.656744
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 1.000000 0.000000
50% 2.000000 0.000000 1.000000 0.000000
75% 2.000000 2.000000 3.000000 1.000000
max 6.000000 6.000000 6.000000 5812.000000
4 count 12416.000000 12416.000000 12416.000000 12416.000000
mean 1.165432 1.333682 1.962629 0.861147
std 1.816148 1.961621 1.668741 7.339446
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 1.000000 0.000000
50% 0.000000 0.000000 1.000000 0.000000
75% 3.000000 3.000000 4.000000 0.000000
max 5.000000 6.000000 6.000000 303.000000
5 count 1225.000000 1225.000000 1225.000000 1225.000000
mean 4.000000 2.000000 2.000000 0.043265
std 0.000000 0.000000 0.000000 0.355520
min 4.000000 2.000000 2.000000 0.000000
25% 4.000000 2.000000 2.000000 0.000000
50% 4.000000 2.000000 2.000000 0.000000
75% 4.000000 2.000000 2.000000 0.000000
max 4.000000 2.000000 2.000000 6.000000
6 count 8278.000000 8278.000000 8278.000000 8278.000000
mean 1.232786 2.927036 3.494443 47.158734
std 0.925145 2.318769 2.348681 458.825604
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 1.000000
50% 1.000000 3.000000 4.000000 2.000000
75% 2.000000 5.000000 6.000000 9.000000
max 3.000000 6.000000 6.000000 31783.000000
tweet_id
cluster_fs1
0 count 6.737000e+03
mean 1.801443e+06
std 7.058985e+05
min 3.163570e+05
25% 1.195259e+06
50% 2.042541e+06
75% 2.377391e+06
max 2.678732e+06
1 count 7.896000e+03
mean 1.705424e+06
std 7.523128e+05
min 3.163150e+05
25% 9.946095e+05
50% 2.007374e+06
75% 2.360434e+06
max 2.678850e+06
2 count 6.732000e+03
mean 1.158924e+06
std 8.316978e+05
min 3.163160e+05
25% 3.434955e+05
50% 7.048105e+05
75% 2.031512e+06
max 2.678890e+06
3 count 6.716000e+03
mean 1.743311e+06
std 7.460642e+05
min 3.163110e+05
25% 1.001803e+06
50% 2.030056e+06
75% 2.376600e+06
max 2.678886e+06
4 count 1.241600e+04
mean 1.842138e+06
std 6.024604e+05
min 3.163520e+05
25% 1.631508e+06
50% 2.002042e+06
75% 2.365838e+06
max 2.678849e+06
5 count 1.225000e+03
mean 1.639788e+06
std 2.078353e+04
min 1.604422e+06
25% 1.632814e+06
50% 1.637839e+06
75% 1.643534e+06
max 1.680809e+06
6 count 8.278000e+03
mean 1.731183e+06
std 6.952790e+05
min 3.163310e+05
25% 1.066951e+06
50% 1.833227e+06
75% 2.357281e+06
max 2.678810e+06
cluster_fs1 cluster_fs3 cluster_fs4 retweets \
cluster_fs2
0 count 19163.000000 19163.000000 19163.000000 19163.000000
mean 2.927203 1.394041 1.411418 9.667641
std 1.707712 1.975340 1.186644 158.210595
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 1.000000 0.000000
50% 4.000000 0.000000 1.000000 0.000000
75% 4.000000 2.000000 1.000000 1.000000
max 6.000000 6.000000 6.000000 14819.000000
1 count 5565.000000 5565.000000 5565.000000 5565.000000
mean 5.044744 3.463971 3.857143 40.458041
std 1.990296 2.138543 2.348095 481.668886
min 0.000000 0.000000 0.000000 0.000000
25% 6.000000 2.000000 1.000000 1.000000
50% 6.000000 5.000000 6.000000 2.000000
75% 6.000000 5.000000 6.000000 8.000000
max 6.000000 6.000000 6.000000 31783.000000
2 count 7760.000000 7760.000000 7760.000000 7760.000000
mean 2.845232 1.633634 1.971521 20.356959
std 1.617415 2.078745 1.748114 339.125413
min 0.000000 0.000000 0.000000 0.000000
25% 3.000000 0.000000 1.000000 0.000000
50% 3.000000 0.000000 1.000000 0.000000
75% 3.000000 3.000000 3.000000 2.000000
max 6.000000 6.000000 6.000000 27137.000000
3 count 6535.000000 6535.000000 6535.000000 6535.000000
mean 3.109870 1.236725 3.726549 13.790360
std 2.050760 1.026078 0.931223 155.472062
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 1.000000 4.000000 0.000000
50% 4.000000 1.000000 4.000000 0.000000
75% 4.000000 1.000000 4.000000 2.000000
max 6.000000 6.000000 6.000000 7649.000000
4 count 1227.000000 1227.000000 1227.000000 1227.000000
mean 4.996740 1.999185 1.999185 0.049715
std 0.090255 0.028548 0.028548 0.421648
min 2.000000 1.000000 1.000000 0.000000
25% 5.000000 2.000000 2.000000 0.000000
50% 5.000000 2.000000 2.000000 0.000000
75% 5.000000 2.000000 2.000000 0.000000
max 5.000000 2.000000 2.000000 8.000000
5 count 1440.000000 1440.000000 1440.000000 1440.000000
mean 4.000000 4.000000 5.000000 0.007639
std 0.000000 0.000000 0.000000 0.108422
min 4.000000 4.000000 5.000000 0.000000
25% 4.000000 4.000000 5.000000 0.000000
50% 4.000000 4.000000 5.000000 0.000000
75% 4.000000 4.000000 5.000000 0.000000
max 4.000000 4.000000 5.000000 3.000000
6 count 8310.000000 8310.000000 8310.000000 8310.000000
mean 1.023345 1.605174 1.806017 21.036582
std 0.562310 2.075057 1.643320 214.662668
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 1.000000 0.000000 1.000000 0.000000
75% 1.000000 3.000000 3.000000 2.000000
max 3.000000 6.000000 6.000000 10004.000000
tweet_id
cluster_fs2
0 count 1.916300e+04
mean 1.717102e+06
std 7.665425e+05
min 3.163160e+05
25% 9.977970e+05
50% 2.017529e+06
75% 2.373836e+06
max 2.678890e+06
1 count 5.565000e+03
mean 1.686093e+06
std 7.070837e+05
min 3.163310e+05
25% 1.007599e+06
50% 1.809627e+06
75% 2.347455e+06
max 2.678810e+06
2 count 7.760000e+03
mean 1.701865e+06
std 7.698233e+05
min 3.164260e+05
25% 9.844118e+05
50% 2.015080e+06
75% 2.371116e+06
max 2.678763e+06
3 count 6.535000e+03
mean 1.582939e+06
std 7.279300e+05
min 3.163110e+05
25% 9.615650e+05
50% 1.766396e+06
75% 2.118306e+06
max 2.678832e+06
4 count 1.227000e+03
mean 1.639939e+06
std 2.121400e+04
min 1.604422e+06
25% 1.632815e+06
50% 1.637908e+06
75% 1.643588e+06
max 1.786686e+06
5 count 1.440000e+03
mean 1.649164e+06
std 3.472618e+04
min 8.675960e+05
25% 1.634522e+06
50% 1.639980e+06
75% 1.644792e+06
max 1.770109e+06
6 count 8.310000e+03
mean 1.696490e+06
std 7.627692e+05
min 3.163150e+05
25% 9.881280e+05
50% 2.010452e+06
75% 2.358236e+06
max 2.678850e+06
cluster_fs1 cluster_fs2 cluster_fs4 retweets \
cluster_fs3
0 count 21101.000000 21101.000000 21101.000000 21101.000000
mean 2.770106 1.755130 1.156012 14.385148
std 1.848761 2.340575 0.608218 266.181865
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 3.000000 0.000000 1.000000 0.000000
75% 4.000000 2.000000 1.000000 1.000000
max 6.000000 6.000000 6.000000 27137.000000
1 count 6136.000000 6136.000000 6136.000000 6136.000000
mean 3.040906 2.971480 3.991362 15.660202
std 2.009239 0.786379 0.152432 172.023894
min 0.000000 0.000000 1.000000 0.000000
25% 1.000000 3.000000 4.000000 0.000000
50% 4.000000 3.000000 4.000000 0.000000
75% 4.000000 3.000000 4.000000 1.000000
max 6.000000 6.000000 6.000000 7649.000000
2 count 9364.000000 9364.000000 9364.000000 9364.000000
mean 2.462196 1.956536 2.436459 7.906130
std 1.497252 2.242091 0.869262 90.135390
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 2.000000 0.000000
50% 2.000000 1.000000 3.000000 0.000000
75% 3.000000 4.000000 3.000000 1.000000
max 6.000000 6.000000 6.000000 5192.000000
3 count 3021.000000 3021.000000 3021.000000 3021.000000
mean 2.701423 1.999338 1.508772 14.857994
std 1.991939 2.295816 1.353790 96.943382
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 1.000000 0.000000
50% 3.000000 1.000000 1.000000 0.000000
75% 4.000000 3.000000 1.000000 2.000000
max 6.000000 6.000000 6.000000 1991.000000
4 count 1459.000000 1459.000000 1459.000000 1459.000000
mean 3.984236 4.955449 4.994517 0.016450
std 0.246565 0.458200 0.148047 0.212124
min 0.000000 0.000000 1.000000 0.000000
25% 4.000000 5.000000 5.000000 0.000000
50% 4.000000 5.000000 5.000000 0.000000
75% 4.000000 5.000000 5.000000 0.000000
max 6.000000 6.000000 5.000000 6.000000
5 count 4975.000000 4975.000000 4975.000000 4975.000000
mean 4.365025 1.846432 5.882211 45.420101
std 2.435581 1.834691 0.600402 517.945374
min 0.000000 0.000000 1.000000 0.000000
25% 1.000000 1.000000 6.000000 1.000000
50% 6.000000 1.000000 6.000000 2.000000
75% 6.000000 2.000000 6.000000 8.000000
max 6.000000 6.000000 6.000000 31783.000000
6 count 3944.000000 3944.000000 3944.000000 3944.000000
mean 2.816430 1.676724 0.004310 22.526876
std 2.039127 2.240929 0.094118 149.352267
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000 0.000000
50% 3.000000 0.000000 0.000000 0.000000
75% 4.000000 2.000000 0.000000 3.000000
max 6.000000 6.000000 3.000000 3723.000000
tweet_id
cluster_fs3
0 count 2.110100e+04
mean 1.836213e+06
std 6.824786e+05
min 3.163810e+05
25% 1.362619e+06
50% 2.046974e+06
75% 2.388019e+06
max 2.678886e+06
1 count 6.136000e+03
mean 1.572405e+06
std 7.198915e+05
min 3.163170e+05
25% 9.614235e+05
50% 1.763256e+06
75% 2.115240e+06
max 2.678809e+06
2 count 9.364000e+03
mean 1.179238e+06
std 7.709769e+05
min 3.163160e+05
25% 5.123998e+05
50% 9.583535e+05
75% 1.781745e+06
max 2.678890e+06
3 count 3.021000e+03
mean 1.822107e+06
std 6.996226e+05
min 3.163680e+05
25% 1.362381e+06
50% 2.060101e+06
75% 2.379874e+06
max 2.678850e+06
4 count 1.459000e+03
mean 1.655039e+06
std 8.500740e+04
min 6.169490e+05
25% 1.634538e+06
50% 1.640051e+06
75% 1.644934e+06
max 2.671737e+06
5 count 4.975000e+03
mean 1.741655e+06
std 6.752916e+05
min 3.163310e+05
25% 1.069214e+06
50% 1.831875e+06
75% 2.356020e+06
max 2.678763e+06
6 count 3.944000e+03
mean 2.105259e+06
std 4.979767e+05
min 3.163110e+05
25% 1.998756e+06
50% 2.111720e+06
75% 2.502467e+06
max 2.678849e+06
cluster_fs1 cluster_fs2 cluster_fs3 retweets \
cluster_fs4
0 count 4283.000000 4283.000000 4283.000000 4283.000000
mean 2.799673 1.679897 5.719122 22.739435
std 2.043440 2.252323 0.988376 147.136238
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 6.000000 0.000000
50% 3.000000 0.000000 6.000000 0.000000
75% 4.000000 2.000000 6.000000 2.000000
max 6.000000 6.000000 6.000000 3723.000000
1 count 23794.000000 23794.000000 23794.000000 23794.000000
mean 2.706481 1.728083 0.445280 13.427503
std 1.837638 2.346297 0.986194 250.776197
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000 0.000000
50% 3.000000 0.000000 0.000000 0.000000
75% 4.000000 2.000000 0.000000 1.000000
max 6.000000 6.000000 6.000000 27137.000000
2 count 1366.000000 1366.000000 1366.000000 1366.000000
mean 4.737189 3.740849 1.922401 0.614202
std 0.957968 1.022904 0.462337 9.939365
min 0.000000 0.000000 0.000000 0.000000
25% 5.000000 4.000000 2.000000 0.000000
50% 5.000000 4.000000 2.000000 0.000000
75% 5.000000 4.000000 2.000000 0.000000
max 6.000000 6.000000 5.000000 276.000000
3 count 7331.000000 7331.000000 7331.000000 7331.000000
mean 2.197927 1.741918 1.802346 11.303233
std 1.316775 2.269350 0.885586 93.034429
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 2.000000 0.000000
50% 2.000000 1.000000 2.000000 0.000000
75% 2.000000 2.000000 2.000000 1.000000
max 6.000000 6.000000 6.000000 3680.000000
4 count 6771.000000 6771.000000 6771.000000 6771.000000
mean 3.016393 2.963669 1.072958 15.048442
std 2.007175 0.818812 0.506395 165.790559
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 3.000000 1.000000 0.000000
50% 4.000000 3.000000 1.000000 0.000000
75% 4.000000 3.000000 1.000000 1.000000
max 6.000000 6.000000 3.000000 7649.000000
5 count 1508.000000 1508.000000 1508.000000 1508.000000
mean 3.937003 4.855438 3.865385 0.035146
std 0.462971 0.822086 0.720208 0.553170
min 0.000000 0.000000 0.000000 0.000000
25% 4.000000 5.000000 4.000000 0.000000
50% 4.000000 5.000000 4.000000 0.000000
75% 4.000000 5.000000 4.000000 0.000000
max 6.000000 6.000000 4.000000 19.000000
6 count 4947.000000 4947.000000 4947.000000 4947.000000
mean 4.372549 1.834647 4.907823 46.664645
std 2.432520 1.830597 0.542194 525.198659
min 0.000000 0.000000 0.000000 0.000000
25% 1.000000 1.000000 5.000000 1.000000
50% 6.000000 1.000000 5.000000 2.000000
75% 6.000000 2.000000 5.000000 8.000000
max 6.000000 6.000000 5.000000 31783.000000
tweet_id
cluster_fs4
0 count 4.283000e+03
mean 2.107246e+06
std 4.989315e+05
min 3.163110e+05
25% 1.999204e+06
50% 2.114811e+06
75% 2.501846e+06
max 2.678849e+06
1 count 2.379400e+04
mean 1.853439e+06
std 6.643012e+05
min 3.163160e+05
25% 1.495676e+06
50% 2.051500e+06
75% 2.388213e+06
max 2.678890e+06
2 count 1.366000e+03
mean 1.626001e+06
std 2.223969e+05
min 3.271520e+05
25% 1.628331e+06
50% 1.637978e+06
75% 1.644521e+06
max 2.673290e+06
3 count 7.331000e+03
mean 9.792313e+05
std 7.752636e+05
min 3.163500e+05
25% 3.352245e+05
50% 6.101320e+05
75% 1.765522e+06
max 2.678810e+06
4 count 6.771000e+03
mean 1.563080e+06
std 7.261663e+05
min 3.163170e+05
25% 9.569880e+05
50% 1.684840e+06
75% 2.115740e+06
max 2.678832e+06
5 count 1.508000e+03
mean 1.658689e+06
std 1.449377e+05
min 3.224040e+05
25% 1.634538e+06
50% 1.640169e+06
75% 1.645240e+06
max 2.671737e+06
6 count 4.947000e+03
mean 1.761253e+06
std 6.620330e+05
min 3.163310e+05
25% 1.105948e+06
50% 1.836332e+06
75% 2.358328e+06
max 2.678763e+06
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Content source: ProjectsUCSC/NLP
Similar notebooks: