In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
## Processing function for removing smart quotes

def remove_smartquotes(frame):
    frame_ = frame.copy()
    frame_.loc[:,['message']] = frame_.message.apply(lambda x: x.replace("‘", "'"))
    frame_.loc[:,['message']] = frame_.message.apply(lambda x: x.replace("’", "'"))
    frame_.loc[:,['message']] = frame_.message.apply(lambda x: x.replace("“", "\""))
    frame_.loc[:,['message']] = frame_.message.apply(lambda x: x.replace("”", "\""))
    return frame_

In [3]:
## More processing code that will convert the slack channel csv into something usable

def process_slack_channel(filename):
    #Processing raw channel messages
    channel_frame = pd.read_csv(filename, header=None)
    print("total number of messages:", len(channel_frame))
    
    channel_frame.columns = ['year', 'channel_id', 'user_id', 'message', 'timestamp']
    channel_frame.message = channel_frame.message.apply(lambda x:x.strip('"'))

    #Throw out user action notifications, re-index
    channel_frame = channel_frame[channel_frame.message.apply(lambda x: re.match(r'\\u.+\|.+\\u003e', x) is None)]
    channel_frame.index = range(len(channel_frame))
    print("after throwing out channel events:", len(channel_frame))
    



    #Throw out empty lines, reindex
    channel_frame = channel_frame[channel_frame.message != '']
    channel_frame.index = range(len(channel_frame))

    channel_frame.message = channel_frame.message.apply(lambda x: x.lower())
    print("after throwing out empty messages:", len(channel_frame))
    
    #Replace users with |USER| tag
    channel_frame.message = channel_frame.message.apply(lambda x: re.sub(r"\\u003.+?\\u003e", "|USER|", x))
    
    #Convert timestamp to actual timestamp object
    
    channel_frame.timestamp = pd.to_datetime(channel_frame.timestamp, unit='s')

    
    return remove_smartquotes(channel_frame)

In [4]:
## Put the filename in the quotes below

support_messages = process_slack_channel('support_messages.csv')


total number of messages: 48558
after throwing out channel events: 45240
after throwing out empty messages: 45185

In [5]:
support_messages # uncomment to show messages


Out[5]:
year channel_id user_id message timestamp
0 2011 C0DUNQVPS U0BPUAG7N sounds good 2015-11-18 23:44:09.000008
1 2011 C0ELD9Y3G U0BTA8US0 fyi i just added a new knowledge article to sc... 2016-07-30 02:19:27.000002
2 2011 C0ELD9Y3G U02V8GURN perfect thank you. 2015-11-17 16:40:05.000004
3 2011 C0ELD9Y3G U0BSVEW4Q yes. they will be updating the queue spreadsh... 2015-11-17 16:39:47.000003
4 2011 C0ELD9Y3G U02V8GURN hi lisa - just needing a little clarification ... 2015-11-17 16:38:41.000002
5 2011 C0ELD9Y3G U0BSVEW4Q one note. for a few of the entries, you indic... 2015-11-17 15:05:54.000007
6 2011 C0ELD9Y3G U0BSVEW4Q thanks, amy. 2015-11-17 15:02:01.000006
7 2011 C0ELD9Y3G U0DN4DLH4 i can look at more tomorrow 2015-11-17 05:23:20.000005
8 2011 C0ELD9Y3G U0DN4DLH4 no different than they would need to do, i've... 2015-11-17 05:23:15.000004
9 2011 C0ELD9Y3G U0DN4DLH4 so i've been through lines 6-41, most were in... 2015-11-17 05:21:52.000003
10 2011 C0ELD9Y3G U0DN4DLH4 hey lisa, 2015-11-17 05:20:48.000002
11 2011 C0ELD9Y3G U0DN4DLH4 i can take a look as well a little later 2015-11-17 00:25:16.000012
12 2011 C0ELD9Y3G U0BSVEW4Q thanks. 2015-11-16 23:27:50.000011
13 2011 C0ELD9Y3G U02V8GURN yes i'll take a look 2015-11-16 23:27:34.000010
14 2011 C0ELD9Y3G U0BSVEW4Q there are lots of dealer lists but i'm hoping ... 2015-11-16 23:14:26.000009
15 2011 C0ELD9Y3G U0BSVEW4Q would you guys please go into kristin's spread... 2015-11-16 23:13:55.000008
16 2011 C0ELD9Y3G U0BSVEW4Q i've invited everyone i think 2015-11-16 22:25:37.000007
17 2011 C0DUNQVPS U08NM12HJ for some reason, i didn't see this reply until... 2016-02-29 21:24:36.000002
18 2011 C0DUNQVPS U02HWSVTK we will have to take a look 2016-02-27 00:17:02.000005
19 2011 C0DUNQVPS U02HWSVTK it looks like there is a massive dealer list f... 2016-02-27 00:16:53.000004
20 2011 C0DUNQVPS U08NM12HJ actually, it's moving now but, pretty slow... 2016-02-27 00:00:25.000003
21 2011 C0DUNQVPS U08NM12HJ |USER|: it doesn't look like the dealer list i... 2016-02-26 23:59:24.000002
22 2011 C0DUNQVPS U02HWSVTK are you still seeing the issue? 2016-01-29 22:13:56.000007
23 2011 C0DUNQVPS U02HWSVTK from what i can see (admittedly brief check) i... 2016-01-29 22:13:48.000006
24 2011 C0DUNQVPS U09QHAHDM thanks! 2016-01-29 18:26:17.000005
25 2011 C0DUNQVPS U02HWSVTK ill take a look after my lunch meeting with tom 2016-01-29 18:05:17.000004
26 2011 C0DUNQVPS U02HWSVTK k 2016-01-29 18:05:09.000003
27 2011 C0DUNQVPS U09QHAHDM |USER|: looks like the dealer list isn't initi... 2016-01-29 17:50:00.000002
28 2011 C0DUNQVPS U08NM12HJ that would be awesome! 2016-01-05 18:22:06.000007
29 2011 C0DUNQVPS U053LA028 do you want me to prepare a similar list for a... 2016-01-05 17:55:39.000006
... ... ... ... ... ...
45155 1014 C6KCNP2AD U08HELUHG just pushed a change so that now when a user t... 2017-08-26 20:57:50.000007
45156 1014 C6KCNP2AD USLACKBOT dashboard-issue task uploaded a file: https://... 2017-08-25 17:40:16.000272
45157 1014 C6KCNP2AD USLACKBOT dashboard-issue task uploaded a file: https://... 2017-08-22 23:43:11.000006
45158 1014 C6KCNP2AD USLACKBOT dashboard-issue task uploaded a file: https://... 2017-08-22 17:23:15.000544
45159 1014 C6KCNP2AD U35ALQ075 sounds good 2017-08-21 22:52:35.000181
45160 1014 C6KCNP2AD U087MUEAJ on second thought, rewarming agr usually takes... 2017-08-21 22:51:54.000221
45161 1014 C6KCNP2AD U35ALQ075 thanks! 2017-08-21 22:44:48.000016
45162 1014 C6KCNP2AD U35ALQ075 i'll just leave it out of the summary this week 2017-08-21 22:44:46.000141
45163 1014 C6KCNP2AD U35ALQ075 yeah no rush at all 2017-08-21 22:44:36.000246
45164 1014 C6KCNP2AD U087MUEAJ i can remove the spike and regenerate agr. |US... 2017-08-21 22:33:56.000178
45165 1014 C6KCNP2AD U1UTYACBS but we didn't see a spike in agr when the r sp... 2017-08-21 22:26:10.000342
45166 1014 C6KCNP2AD U08HELUHG as the spike leaves the window, agr plummets? 2017-08-21 22:19:09.000323
45167 1014 C6KCNP2AD U087MUEAJ hmm, why would a spike in r drive down agr? 2017-08-21 22:17:54.000233
45168 1014 C6KCNP2AD U087MUEAJ oh whoops thought that this was data-team 2017-08-21 22:17:35.000210
45169 1014 C6KCNP2AD U087MUEAJ hwanseung is an engineer based in singapore(?)... 2017-08-21 22:16:53.000102
45170 1014 C6KCNP2AD U35ALQ075 and curious if folks here have thoughts on bet... 2017-08-21 22:14:00.000179
45171 1014 C6KCNP2AD U35ALQ075 who is/was hwanseung? :slightly_smiling_face: 2017-08-21 22:12:55.000201
45172 1014 C6KCNP2AD U35ALQ075 --the dri for this is obviously not correct, r... 2017-08-21 22:12:38.000084
45173 1014 C6KCNP2AD U35ALQ075 (and is the bug in r still potentially driving... 2017-08-21 22:12:08.000190
45174 1014 C6KCNP2AD U35ALQ075 --does the 59% --\u003e 50% week over week cha... 2017-08-21 22:11:50.000045
45175 1014 C6KCNP2AD U35ALQ075 two questions (and let me know what i should j... 2017-08-21 22:11:26.000373
45176 1014 C6KCNP2AD U35ALQ075 hi dash friends 2017-08-21 22:11:02.000156
45177 1014 C0W2JJ2BC U6CCLEYA2 yeah, if the token contains a username, slackb... 2017-08-28 18:06:10.000093
45178 1014 C0W2JJ2BC U029BEHA9 thanks :slightly_smiling_face: 2017-08-28 18:04:23.000286
45179 1014 C0W2JJ2BC U44NURUQY it sends a message via slackbot (from personal... 2017-08-28 18:04:12.000028
45180 1014 C0W2JJ2BC U029BEHA9 |USER| does this also @mention the people whos... 2017-08-28 18:02:26.000199
45181 1014 C0W2JJ2BC U6CCLEYA2 the following lego sessions will be released a... 2017-08-22 22:48:49.000074
45182 1014 C0W2JJ2BC U029BEHA9 a very good one 2017-08-22 22:44:17.000358
45183 1014 C0W2JJ2BC U6CCLEYA2 ^that's a test 2017-08-22 22:42:41.000334
45184 1014 C0W2JJ2BC U6CCLEYA2 added an integration to this channel: https://... 2017-08-22 21:45:26.000323

45185 rows × 5 columns


In [8]:
## Here you need to generate your contexts.
## Depending on what your particular assignment is, you need to group the text in different ways.
## You should end up with a contexts that is a list of strings

contexts = ["how not to go", "Do not go!"]

#for i in [1]:
#   contexts.append(bar)

In [9]:
## In this cell you need to 
## A) initialize your vectorizer object with the settings you will want to use to make
## your bag of words vectors
## B) Fit your vectorizer and generate a matrix of vectors

vectorizer = TfidfVectorizer(use_idf=False)
tfidf_vectors = vectorizer.fit_transform(contexts)

In [ ]:


In [10]:
def get_similar_terms(term):
    term_vector = bow_vector_for_the_term
    
    cosine_similarities = cosine_similarities_between_term_and_all_other_terms
    
    
    ## This is a way of making sure the terms distances are sorted in the corect order, so we take the
    ## top 10.
    top_10 = np.flip(np.argsort(cosine_similarities, axis=0), axis=0).T[0][:10]
    results = []
    for i in top_10:
        results.append(vectorizer.get_feature_names()[i])
    return results

In [11]:
get_similar_terms('printer')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-b4122bc1ecc0> in <module>()
----> 1 get_similar_terms('printer')

<ipython-input-10-546778697e50> in get_similar_terms(term)
      1 def get_similar_terms(term):
----> 2     term_vector = bow_vector_for_the_term
      3 
      4     cosine_similarities = cosine_similarities_between_term_and_all_other_terms
      5 

NameError: name 'bow_vector_for_the_term' is not defined

In [ ]: