In [1]:
import graphlab as gl
In [2]:
# Print the first two lines of the raw review file to see its layout
# before loading it.
!head -n 2 ../data/yelp/yelp_training_set_review.json
In [3]:
# Load the review file into an SFrame without treating its first line as a
# header row, then display the result.
reviews = gl.SFrame.read_csv(
    '../data/yelp/yelp_training_set_review.json',
    header=False,
)
reviews
Out[3]:
In [4]:
# Look at the first loaded row.
reviews[0]
Out[4]:
In [5]:
# Expand the single 'X1' column into one column per field. The empty prefix
# leaves the new columns named after the fields themselves (e.g. 'votes',
# 'date', 'text', which later cells rely on).
reviews = reviews.unpack('X1', column_name_prefix='')
reviews
Out[5]:
In [6]:
# Expand the nested 'votes' column the same way, again with an empty prefix so
# the resulting columns ('funny', 'cool', 'useful') can be addressed directly.
reviews = reviews.unpack('votes', column_name_prefix='')
reviews
Out[6]:
In [7]:
# Visually explore the unpacked review columns.
reviews.show()
In [8]:
# Parse the 'date' strings (year-month-day) into datetime values, replacing
# the original string column.
reviews['date'] = reviews['date'].str_to_datetime('%Y-%m-%d')
In [9]:
# Total number of votes per review, summed across the three vote columns.
reviews['total_votes'] = (
    reviews['funny']
    + reviews['cool']
    + reviews['useful']
)
reviews
Out[9]:
In [10]:
# Element-wise comparison: flags the rows whose vote total is positive.
reviews['total_votes'] > 0
Out[10]:
In [11]:
# Keep only the reviews that received at least one vote of any kind.
has_votes = reviews['total_votes'] > 0
reviews = reviews[has_votes]
reviews
Out[11]:
In [12]:
# Turn the 'funny' column into a binary label: true where the review has a
# positive 'funny' value, false otherwise. This overwrites the original column.
reviews['funny'] = reviews['funny'] > 0
In [13]:
# Drop everything except the review text and the binary 'funny' label.
reviews = reviews.select_columns(['text', 'funny'])
reviews
Out[13]:
In [14]:
# Work with at most the first 10000 remaining reviews.
reviews = reviews.head(10000)
In [15]:
# Characters that separate words when counting: whitespace first, then
# punctuation and symbols. Note the apostrophe is not in the list.
word_delims = (
    ["\r", "\v", "\n", "\f", "\t", " "]
    + ['~', '`', '!', '@', '#', '$', '%', '^', '&', '*', '-', '_', '+', '=']
    + [',', '.', ';', ':', '\"', '?', '|', '\\', '/']
    + ['<', '>', '(', ')', '[', ']', '{', '}']
)

# Bag-of-words representation of each review's text, split on the delimiters
# above.
reviews['bow'] = gl.text_analytics.count_words(
    reviews['text'],
    delimiters=word_delims,
)
In [16]:
# Compute TF-IDF from the per-review word counts in 'bow'.
reviews['tf_idf'] = gl.text_analytics.tf_idf(reviews['bow'])
In [17]:
# Replace each 'tf_idf' entry with the value stored under its 'docs' key.
# NOTE(review): assumes every tf_idf element is a dict containing 'docs' --
# confirm against the installed GraphLab version's tf_idf return format.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time applies the 'docs' lookup to the already-extracted values.
reviews['tf_idf'] = reviews['tf_idf'].apply(lambda x: x['docs'])
In [18]:
# Display the frame with the new 'bow' and 'tf_idf' feature columns.
reviews
Out[18]:
In [19]:
# Randomly assign about 80% of the rows to training and the rest to testing.
# A fixed seed is passed so the partition, and every metric computed from it
# in later cells, is the same on each Restart & Run All.
SPLIT_SEED = 42
train_sf, test_sf = reviews.random_split(0.8, seed=SPLIT_SEED)
In [20]:
# Model 1: logistic classifier predicting 'funny' from the raw word counts.
# No validation set is used and feature rescaling is switched off.
m1 = gl.logistic_classifier.create(
    train_sf,
    target='funny',
    features=['bow'],
    validation_set=None,
    feature_rescaling=False,
)
In [21]:
# Model 2: same classifier settings as the word-count model, but trained on
# the 'tf_idf' features instead of 'bow'.
m2 = gl.logistic_classifier.create(
    train_sf,
    target='funny',
    features=['tf_idf'],
    validation_set=None,
    feature_rescaling=False,
)
In [22]:
# Evaluate the word-count model on the held-out test rows and show the result.
m1_res = m1.evaluate(test_sf)
m1_res
Out[22]:
In [23]:
# Evaluate the TF-IDF model on the same held-out test rows and show the result.
m2_res = m2.evaluate(test_sf)
m2_res
Out[23]:
Fraction of 'funny' reviews in the test set:
In [25]:
# Share of test rows labelled funny: label sum divided by the row count.
# The float() cast forces true division.
float(test_sf['funny'].sum()) / len(test_sf)
Out[25]:
Fraction of not-funny reviews in the test set:
In [26]:
# Complement of the funny share: the proportion of test rows not labelled
# funny.
1.0 - float(test_sf['funny'].sum()) / len(test_sf)
Out[26]:
In [ ]: