In [1]:
import graphlab as gl
In [2]:
train = gl.SFrame.read_csv("../data/train.csv")
In [3]:
test = gl.SFrame.read_csv("../data/test.csv")
In [4]:
desc = gl.SFrame.read_csv("../data/product_descriptions.csv")
In [5]:
# merge train with description
train = train.join(desc, on = 'product_uid', how = 'left')
In [6]:
# merge test with description
test = test.join(desc, on = 'product_uid', how = 'left')
Let's examine three different queries and the products they were matched with:
In [7]:
first_doc = train[0]
first_doc
Out[7]:
The search term 'angle bracket' is not contained in the product description: 'angle' would match after stemming, but 'bracket' does not appear at all.
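To sanity-check that stemming intuition, a stemmer can be run over the terms. The snippet below uses NLTK's PorterStemmer, which is an assumption here and not a dependency of the rest of this notebook:

# Illustrative only: NLTK is not used anywhere else in this notebook.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('angles'))    # 'angl' -- shares a stem with 'angle'
print(stemmer.stem('angle'))     # 'angl'
print(stemmer.stem('bracket'))   # 'bracket' -- still absent from the description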
In [8]:
middle_doc = train[37033]
middle_doc
Out[8]:
Only 'wood' from the search term is present.
In [9]:
last_doc = train[-1]
last_doc
Out[9]:
Only 'sheer' and 'courtain' from the search term are present.
In [10]:
train['search_term_word_count'] = gl.text_analytics.count_words(train['search_term'])
# keep only the rows with the highest relevance score (3)
ranked3doc = train[train['relevance'] == 3]
print ranked3doc.head()
len(ranked3doc)
Out[10]:
In [11]:
words_search = gl.text_analytics.tokenize(ranked3doc['search_term'], to_lower = True)
words_description = gl.text_analytics.tokenize(ranked3doc['product_description'], to_lower = True)
words_title = gl.text_analytics.tokenize(ranked3doc['product_title'], to_lower = True)
wordsdiff_desc = []
wordsdiff_title = []
puid = []
search_term = []
ws_count = []
ws_count_used_desc = []
ws_count_used_title = []
for item in xrange(len(ranked3doc)):
    ws = words_search[item]
    pd = words_description[item]
    pt = words_title[item]
    # search-term words that do NOT appear in the description / title,
    # stored as lists so they can become SFrame columns
    diff = set(ws) - set(pd)
    wordsdiff_desc.append(list(diff))
    diff2 = set(ws) - set(pt)
    wordsdiff_title.append(list(diff2))
    puid.append(ranked3doc[item]['product_uid'])
    search_term.append(ranked3doc[item]['search_term'])
    ws_count.append(len(ws))
    ws_count_used_desc.append(len(ws) - len(diff))
    ws_count_used_title.append(len(ws) - len(diff2))
differences = gl.SFrame({"puid" : puid,
                         "search term" : search_term,
                         "diff desc" : wordsdiff_desc,
                         "diff title" : wordsdiff_title,
                         "ws count" : ws_count,
                         "ws count used desc" : ws_count_used_desc,
                         "ws count used title" : ws_count_used_title})
In [12]:
differences.sort(['ws count used desc', 'ws count used title'])
Out[12]:
In [13]:
print "No terms used in description : " + str(len(differences[differences['ws count used desc'] == 0]))
print "No terms used in title : " + str(len(differences[differences['ws count used title'] == 0]))
print "No terms used in description and title : " + str(len(differences[(differences['ws count used desc'] == 0) &
(differences['ws count used title'] == 0)]))
In [14]:
import matplotlib.pyplot as plt
%matplotlib inline
In [15]:
train_search_tfidf = gl.text_analytics.tf_idf(train['search_term_word_count'])
In [16]:
train['search_tfidf'] = train_search_tfidf
In [17]:
train['product_desc_word_count'] = gl.text_analytics.count_words(train['product_description'])
train_desc_tfidf = gl.text_analytics.tf_idf(train['product_desc_word_count'])
In [18]:
train['desc_tfidf'] = train_desc_tfidf
In [19]:
train['product_title_word_count'] = gl.text_analytics.count_words(train['product_title'])
train_title_tfidf = gl.text_analytics.tf_idf(train['product_title_word_count'])
train['title_tfidf'] = train_title_tfidf
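For intuition, tf-idf multiplies each word's count in a document by a penalty for how common the word is across all documents. Below is a minimal pure-Python sketch of the usual tf × log(N / df) weighting over bag-of-words dictionaries; the exact smoothing used inside gl.text_analytics.tf_idf may differ, and the toy documents are made up:

import math

def tf_idf_sketch(docs):
    # docs: a list of {word: count} dictionaries, like the *_word_count columns above
    n_docs = float(len(docs))
    df = {}
    for doc in docs:
        for word in doc:
            df[word] = df.get(word, 0) + 1
    # weight each count by log(number of documents / documents containing the word)
    return [dict((w, c * math.log(n_docs / df[w])) for w, c in doc.items())
            for doc in docs]

print(tf_idf_sketch([{'angle': 1, 'bracket': 2}, {'angle': 1, 'wood': 3}]))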
In [20]:
train['distance'] = train.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['desc_tfidf']))
train['distance2'] = train.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['title_tfidf']))
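gl.distances.cosine on two sparse dictionaries should return a cosine distance, i.e. one minus the cosine similarity of the two tf-idf vectors. A hand-rolled version for intuition (the helper name is made up, and the zero-norm handling is a choice of this sketch, not necessarily GraphLab's):

import math

def cosine_distance_sketch(a, b):
    # a, b: {word: tf-idf weight} dictionaries
    dot = sum(a[w] * b[w] for w in a if w in b)
    norm_a = math.sqrt(sum(v * v for v in a.values()))
    norm_b = math.sqrt(sum(v * v for v in b.values()))
    if norm_a == 0 or norm_b == 0:
        return 1.0   # no shared support; treat as maximally distant
    return 1.0 - dot / (norm_a * norm_b)

# should roughly match train['distance'][0]
print(cosine_distance_sketch(train[0]['search_tfidf'], train[0]['desc_tfidf']))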
In [21]:
model1 = gl.linear_regression.create(train, target = 'relevance', features = ['distance', 'distance2'], validation_set = None)
In [23]:
#let's take a look at the weights before we plot
model1.get("coefficients")
Out[23]:
In [25]:
test['search_term_word_count'] = gl.text_analytics.count_words(test['search_term'])
test_search_tfidf = gl.text_analytics.tf_idf(test['search_term_word_count'])
test['search_tfidf'] = test_search_tfidf
test['product_desc_word_count'] = gl.text_analytics.count_words(test['product_description'])
test_desc_tfidf = gl.text_analytics.tf_idf(test['product_desc_word_count'])
test['desc_tfidf'] = test_desc_tfidf
test['product_title_word_count'] = gl.text_analytics.count_words(test['product_title'])
test_title_tfidf = gl.text_analytics.tf_idf(test['product_title_word_count'])
test['title_tfidf'] = test_title_tfidf
test['distance'] = test.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['desc_tfidf']))
test['distance2'] = test.apply(lambda x: gl.distances.cosine(x['search_tfidf'],x['title_tfidf']))
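The cell above repeats the train feature pipeline verbatim on test. A small helper (the name add_text_features is hypothetical) would keep the two in sync; note that, exactly as in the cells above, tf-idf is fit separately on each SFrame, so train and test end up with different document frequencies:

def add_text_features(sf):
    # reproduce the word-count, tf-idf and cosine-distance columns built above
    sf['search_term_word_count'] = gl.text_analytics.count_words(sf['search_term'])
    sf['search_tfidf'] = gl.text_analytics.tf_idf(sf['search_term_word_count'])
    sf['product_desc_word_count'] = gl.text_analytics.count_words(sf['product_description'])
    sf['desc_tfidf'] = gl.text_analytics.tf_idf(sf['product_desc_word_count'])
    sf['product_title_word_count'] = gl.text_analytics.count_words(sf['product_title'])
    sf['title_tfidf'] = gl.text_analytics.tf_idf(sf['product_title_word_count'])
    sf['distance'] = sf.apply(lambda x: gl.distances.cosine(x['search_tfidf'], x['desc_tfidf']))
    sf['distance2'] = sf.apply(lambda x: gl.distances.cosine(x['search_tfidf'], x['title_tfidf']))
    return sf

# train = add_text_features(train); test = add_text_features(test)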
In [27]:
# The Kaggle test set has no 'relevance' column, so this check cannot run on it:
'''
predictions_test = model1.predict(test)
test_errors = predictions_test - test['relevance']
RSS_test = sum(test_errors * test_errors)
print RSS_test
'''
Out[27]:
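The block above is left commented out because the test set has no relevance labels. For an offline error estimate, one option (a sketch, not part of the original notebook) is to hold out part of train and evaluate there:

# Sketch only: re-split train and measure error on the held-out rows.
train_part, valid_part = train.random_split(0.8, seed = 0)
model_check = gl.linear_regression.create(train_part, target = 'relevance',
                                          features = ['distance', 'distance2'],
                                          validation_set = None)
print(model_check.evaluate(valid_part))   # GraphLab's regression evaluate reports rmse / max_error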
In [ ]:
output = model1.predict(test)
In [ ]:
submission = gl.SFrame(test['id'])
In [ ]:
# columns built from bare SArrays default to 'X1'/'X2'; rename them for the submission format
submission.add_column(output)
submission.rename({'X1': 'id', 'X2': 'relevance'})
In [ ]:
# clamp predictions to the valid relevance range [1.0, 3.0]
submission['relevance'] = submission.apply(lambda x: 3.0 if x['relevance'] > 3.0 else x['relevance'])
submission['relevance'] = submission.apply(lambda x: 1.0 if x['relevance'] < 1.0 else x['relevance'])
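The same clamping can be done in one pass with SArray.clip, assuming the standard GraphLab Create API:

# equivalent to the two apply calls above
submission['relevance'] = submission['relevance'].clip(lower = 1.0, upper = 3.0)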
In [ ]:
submission['relevance'] = submission.apply(lambda x: str(x['relevance']))
In [ ]:
submission.export_csv('../data/submission.csv', quote_level = 3)
In [ ]:
#gl.canvas.set_target('ipynb')