In [23]:
from setup_notebooks import *
%matplotlib inline
In [24]:
# Widen the notebook to the full browser width and tune pandas table rendering.
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 12)  # truncate long frames to 12 rows
pd.set_option('display.max_columns', 200)  # but allow wide frames to show fully
In [25]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
In [26]:
# Load the pre-split tweet data: timestamps, numeric features, and text.
# NOTE(review): engine='python' is presumably for gzip handling on older pandas — confirm.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
So the new things are LsiModel and scatmat
In [27]:
# Peek at the saved tokens column.
df.tokens
Out[27]:
Load cleaned tweet data
Don't forget to fix up the tokens!
Can you think of a better way to save a list of lists of strings?
What about the raw, unprocessed unicode tweet text itself?
In [28]:
# Build a gensim Dictionary (token <-> integer id map) from whitespace-split tweet text.
vocab = Dictionary(df.txt.str.split())
print(vocab)
In [29]:
# Fit TFIDF weights from the Dictionary's document-frequency statistics.
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
tfidf.num_docs  # number of documents the model was fit on
Out[29]:
In [30]:
# One sparse bag-of-words -- a list of (token_id, count) pairs -- per tweet.
token_lists = df.txt.str.split()
bows = pd.Series([vocab.doc2bow(token_list) for token_list in token_lists])
bows
Out[30]:
This would make a nice, compact sparse matrix representation of our entire corpus...
Which would mean we could do more in RAM at once.
Left as an exercise (check out scipy.sparse.coo_matrix).
In [14]:
# Forward lookup: integer id assigned to a token.
vocab.token2id['publishes']
Out[14]:
In [16]:
# Reverse lookup: token for id 0.
vocab[0]
Out[16]:
In [17]:
# Id for the lowercase token.
vocab.token2id['python']
Out[17]:
In [18]:
# Case matters: 'Python' gets a different id than 'python'.
vocab.token2id['Python']
Out[18]:
In [20]:
# Human-readable TFIDF vector for the first tweet: {token: weight}, rounded to 2 places.
{vocab[term_id]: round(weight, 2) for term_id, weight in tfidf[bows[0]]}
Out[20]:
Notice how "you" didn't get as much weight as "enjoy"
Let's look at some other tweets
In [31]:
# NOTE(review): LsiModel was already imported at the top of the notebook — this re-import is redundant.
from gensim.models import LsiModel
# Truncated SVD over the TFIDF-weighted corpus: compress the ~100k term
# dimensions down to 100 latent topics.
lsi = LsiModel(tfidf[bows], num_topics=100, id2word=vocab, extra_samples=100, power_iters=2)
# lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
len(lsi.id2word)  # size of the vocabulary the model maps over
Out[31]:
This is starting to look a lot like a set of vectors that we could use as features
But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have?
In [32]:
# Number of distinct tokens — the dimensionality of a raw TFIDF vector.
len(vocab)
Out[32]:
100k dimensions isn't a good idea
Even for a massively parallel deep learning project this would be big
Like the cat/dog picture classification on 256x256 images
What about PCA (Principal Component Analysis) like is used on images?
In NLP this flavor of PCA is called LSI (Latent Semantic Indexing), also known as LSA (Latent Semantic Analysis)
That sounds cool!
I want me some latent semantics (hidden meaning)
In [36]:
# Persist the 100-topic model so later sessions can LsiModel.load() it.
lsi.save(os.path.join(DATA_PATH, 'lsi'))
In [69]:
# A tiny 4-topic model, small enough to visualize in a scatter matrix.
# NOTE(review): this is trained on raw bag-of-words counts (`bows`), not
# `tfidf[bows]` like the 100-topic model above — confirm that's intentional.
lsi4 = LsiModel(bows, num_topics=4, id2word=vocab, extra_samples=100, power_iters=2)
lsi4.save(os.path.join(DATA_PATH, 'lsi4'))
lsi4
Out[69]:
In [41]:
# Project every tweet into the 4-topic space and tabulate it.
topics = lsi4[bows]
# Fix: lsi4 has num_topics=4, so topic ids run 0..3 — columns=range(4).
# The original range(5) silently added a spurious all-NaN 5th column.
df_topics = pd.DataFrame([dict(d) for d in topics], index=df.index, columns=range(4))
In [ ]:
# Fix: the topic frame built above is `df_topics`; `df_topics4` is never
# defined anywhere in this notebook, so the original line raised NameError
# on a fresh kernel. Label each tweet 1 if it got any favorites, else 0.
df_topics['favorites'] = (nums.favorite_count > 0).astype(int)
In [60]:
# Fix: use `df_topics` — `df_topics4` was never created (NameError on a fresh run).
# The 0.13th power compresses the heavy-tailed favorite counts into a few small
# integer bins suitable for coloring a scatter matrix.
df_topics['favorites'] = np.ceil(nums.favorite_count ** .13).astype(int)
df_topics
Out[60]:
In [68]:
# Scatter matrix of the 4 topic dimensions.
# NOTE(review): `scatmat` presumably comes from `setup_notebooks` — confirm.
scatmat(df_topics, num_topics=4)
In [1]:
# Rebuild the whole pipeline from disk (useful after a kernel restart).
# Fix: `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in
# 1.0; `pd.read_csv(..., index_col=0)` is the supported equivalent.
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0)
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), compression='gzip', engine='python')
vocab = Dictionary.from_documents(([str(s) for s in row] for row in df.txt.str.split()))
tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi'))
bows = pd.Series(vocab.doc2bow(toks) for toks in df.txt.str.split())
topics = lsi[bows]
In [71]:
# Peek at the topic-vector frame.
df_topics
Out[71]:
In [73]:
# Scatter matrix over ALL topic dimensions — this is heavy; see the fan warning below.
scatmat(df_topics)
What's that sound I hear?
That's the sound of your fans blowing hot air out of those tweets!
(check out your system monitor or htop)
In [28]:
# Topic vectors for the first 6 tweets: one row per tweet, one column per topic.
tweetids = pd.Series(range(6), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# x is a (topic_id, weight) pair; x[1] keeps just the weight for each topic.
pd.DataFrame([pd.Series([x[1] for x in lsi[bows[i]]], index=topicids,
name='tweet') for i in tweetids],
index=tweetids)
Out[28]:
In [30]:
lsi.save(os.path.join(DATA_PATH, 'lsi'))
# NOTE(review): `lsi5` is not defined in any earlier cell of this notebook —
# this line only works on leftover kernel state (it is *loaded* in the next
# cell, after this save). It will fail under Restart & Run All.
lsi5.save(os.path.join(DATA_PATH, 'lsi5'))
In [5]:
# Reload the saved model (presumably 5 topics, per the filename — confirm).
lsi5 = LsiModel.load(os.path.join(DATA_PATH, 'lsi5'))
In [9]:
# for topic in lsi.show_topics():
# print(topic)
# The 8 highest-weighted terms in topic 0.
lsi5.show_topic(0, 8)
Out[9]:
In [10]:
# NOTE(review): exact duplicate of the previous cell — one of the two can be deleted.
lsi5.show_topic(0, 8)
Out[10]:
In [31]:
# Build the full tweets-by-topics matrix for the entire corpus.
tweetids = pd.Series(range(len(bows)), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# `dict()` keeps track of the columns for each topic, in case the lsi model shuffles or skips topics for odd tweets
# NOTE(review): this rebinds `df` from the raw text frame to the topic matrix;
# later cells expecting text columns on `df` will break.
df = pd.DataFrame([pd.Series(dict(lsi[bows[i]]), name='tweet') for i in tweetids],
columns=topicids,
index=tweetids)
In [32]:
# `df` now holds topic vectors (rebound in the previous cell), not tweet text.
df
Out[32]:
What's with the 1.43?
Aren't they normalized?
... Nope
In [12]:
# First 5 topic columns, every 100th tweet — keeps the scatter matrix fast.
scatmat(df[df.columns[:5]][::100])
In [ ]:
# Fix: `num` is not defined anywhere in this notebook — the numeric frame
# loaded earlier is `nums`; the bare `num` raised NameError.
nums
In [ ]:
# Persist the tweet-by-topic matrix.
# Fix: `pd.io.common.csv` is a private alias removed from modern pandas — use
# the stdlib csv module's quoting constant. Passing the path (not a binary
# handle) lets pandas infer gzip compression from the .gz suffix and avoids
# the Python 3 text/bytes mismatch of to_csv on a 'wb' file object.
import csv
df.to_csv(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'),
          encoding='utf8', quoting=csv.QUOTE_NONNUMERIC)
We built LSI topic vectors for 200k tweets in a few minutes!
Let's look at the TFIDF vectors for the top 6 tweets
In [10]:
# TFIDF vectors for the first 6 tweets: one row per tweet, words as columns.
# Blank out missing entries for readability.
weight_rows = [{vocab[term_id]: weight for term_id, weight in tfidf[bows[doc]]}
               for doc in range(6)]
tfidf6 = pd.DataFrame(weight_rows).fillna('')
tfidf6
Notice the small weights on the word "Python"? Why do you think that is? (Think back to the definitions of TF and DF and TFIDF.)
Now let's see how far apart they are based only on word frequency (TFIDF)
We'll "project" the first tweet onto the second with a dot product
to see how much of a "shadow" they make on each other
In [ ]:
# Rebuild the 6-tweet TFIDF table with numeric zeros (not '') so dot products
# work, then transpose so each tweet is a column vector.
weight_rows = [{vocab[term_id]: weight for term_id, weight in tfidf[bows[doc]]}
               for doc in range(6)]
tfidf6 = pd.DataFrame(weight_rows).fillna(0).T
In [ ]:
# Overlap ("shadow") of tweet 0 projected onto tweet 1.
np.dot(tfidf6[0], tfidf6[1])
In [ ]:
# Tweets 1 and 2 share the token "Python" (see the note below).
np.dot(tfidf6[1], tfidf6[2])
That looks about right.
The first 2 share no words.
The second 2 share only "Python".
But lets do the cosine similarity correctly by normalizing for length.
In [ ]:
# Proper cosine similarity: normalize the dot product by both vector lengths.
np.dot(tfidf6[1], tfidf6[2]) / np.linalg.norm(tfidf6[1]) / np.linalg.norm(tfidf6[2])
Hmmm, nothing changed
Can you guess why?
In [ ]:
# TFIDF overlap for each adjacent pair among the 6 tweets.
[round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)]
In [ ]:
Now let's look at the topic vectors.
In [125]:
# Topic vectors for the same 6 tweets, for side-by-side comparison with TFIDF.
df.iloc[:6]
Out[125]:
In [122]:
# Raw (unnormalized) topic-vector dot products for adjacent tweet pairs.
print([round(np.dot(df.T[i], df.T[i+1]), 4) for i in range(5)])
Better normalize these...
In [123]:
# Cosine similarity of adjacent tweet pairs in LSI topic space.
print([round(np.dot(df.T[i], df.T[i+1]) / np.linalg.norm(df.T[i]) / np.linalg.norm(df.T[i+1]), 4) for i in range(5)])
# for comparison the TFIDF scores right below
print([round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)])
So the really chummy neighbors are 1 & 2 and 3 & 4
Surprisingly 2 & 3 didn't hit it off, and no pairing got a zero!
And the last 2 seem to share a "latent" similarity that TFIDF missed entirely!!!
And LSI picked up on the python<->Python similarity (tweets 0 and 1)
In [133]:
# Reload the raw tweet text.
# Fix: `pd.DataFrame.from_csv` was removed from pandas; `read_csv` with
# index_col=0 is the supported equivalent, and it decompresses .gz paths
# itself — no manual gzip.open needed.
text = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0, encoding='utf8')
In [188]:
# Show the cleaned tokens alongside the original raw text for the first 6 tweets.
for cleaned_tokens, raw_tweet in zip(text.txt.iloc[:6], text.text.iloc[:6]):
    print(cleaned_tokens)
    print(raw_tweet)
    print('-' * 10)
What about a new tweet you are considering?
Notice how I changed the token spelling (BOW),
but not the "semantics" of the tweet.
In [169]:
# Project a brand-new tweet into TFIDF space and then into LSI topic space.
tweet = 'I want to help build django with a job in Chicago'
# Sparse (token_id, count) pairs using the existing vocabulary.
tweet_bow = vocab.doc2bow(tweet.split())
# TFIDF-weighted sparse vector.
tweet_tfidf = tfidf[tweet_bow]
# Dense topic vector, indexed by topic id.
tweet_topics = pd.Series(dict(lsi[tweet_tfidf]))
# Now that the math is done let's convert to a friendlier format with words as the keys/index
tweet_tfidf = pd.Series(dict([(vocab[i], x) for (i, x) in tweet_tfidf]))
print('\nLSI Topic Vector')
tweet_topics
Out[169]:
Compare the topic vector above to the TFIDF vector below.
What's better about TFIDF compared to topic vectors?
What can we do about it?
In [170]:
# The same tweet's word-frequency (TFIDF) representation, for comparison.
print('TFIDF Frequency Vector')
print(tweet_tfidf)
Which one is it closest to?
Can you guess?
Does LSI understand the words as well as you do?
In [167]:
# Cosine similarity between the new tweet's topic vector and each of the first 6 tweets.
print('LSI Topic Similarity')
print([round(np.dot(df.T[i], tweet_topics) / np.linalg.norm(df.T[i]) / np.linalg.norm(tweet_topics), 4) for i in range(6)])
In [184]:
# Append the new tweet as column 6 of the TFIDF table (copy first so tfidf6 is untouched).
tfidf7 = tfidf6.copy()
tfidf7[6] = tweet_tfidf
tfidf7 = tfidf7.fillna(0)
tfidf7
Out[184]:
In [ ]:
In [186]:
# TFIDF overlap of the new tweet (column 6) with each of the original 6 tweets.
print([round(np.dot(tfidf7[i], tfidf7[6]), 4) for i in range(6)])
In [187]:
# The raw text of the new tweet, for reference while answering the questions below.
tweet
Out[187]:
Can you find the one word I accidentally share with the other tweets?
Hint: use the TFIDF matrix (Dataframe)
Play around with the tweet text to make its topic vector more "orthogonal"
Or make it closer in cosine distance.