In [1]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
In [2]:
# note: the rating column is spelled 'reting' in the source CSV, so that name is kept throughout
reviews = pd.read_csv('data/raw_reviews.csv', header=0, usecols=['reting', 'date', 'comment'], encoding='utf-8')
In [3]:
reviews.head()
Out[3]:
In [4]:
reviews.shape
Out[4]:
In [5]:
# drop repeated comment texts, keeping the first occurrence of each
reviews = reviews[~reviews.comment.duplicated()]
reviews.shape
Out[5]:
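A note on the deduplication above: ~reviews.comment.duplicated() keeps the first occurrence of each comment text. The equivalent, slightly more explicit spelling uses drop_duplicates:

reviews = reviews.drop_duplicates(subset='comment')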
In [6]:
reviews.reting.value_counts()
Out[6]:
In [7]:
# distribution of comment lengths in characters (comments over 3000 chars excluded for readability)
reviews[reviews.comment.apply(len) < 3000].comment.apply(len).hist(bins=50)
In [8]:
# optional filter on very long comments, left disabled here:
# reviews = reviews[reviews.comment.apply(len) < 500]
In [ ]:
# distribution of comment lengths in words
reviews.comment.apply(lambda s: len(s.split())).hist(bins=30)
In [10]:
reviews.shape
Out[10]:
In [11]:
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.2
X_train, X_test, y_train, y_test = train_test_split(reviews.comment, reviews.reting,
                                                    test_size=TEST_SIZE, random_state=42)
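The ratings are likely imbalanced (see the value_counts output above), so a stratified split would keep the class proportions identical in train and test. A minimal variant of the cell above; stratify is a standard train_test_split parameter:

X_train, X_test, y_train, y_test = train_test_split(
    reviews.comment, reviews.reting,
    test_size=TEST_SIZE, random_state=42,
    stratify=reviews.reting)  # preserve the rating distribution in both splits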
In [12]:
reviews_train = pd.DataFrame(X_train)
reviews_train['reting'] = y_train
reviews_test = pd.DataFrame(X_test)
reviews_test['reting'] = y_test
In [13]:
reviews_train.to_csv('data/reviews_train.csv', index=False, encoding='utf-8')
reviews_test.to_csv('data/reviews_test.csv', index=False, encoding='utf-8')
In [14]:
from sentence_processor import SentenceProcessor
In [16]:
DIR = 'data/w2v_models/'
MODEL_NAME = 'all.norm-sz100-w10-cb0-it1-min100.w2v'  # pretrained word2vec model (100-dimensional, window 10, per the filename)
w2v_path = DIR + MODEL_NAME
sentence_processor = SentenceProcessor(w2v_path)
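SentenceProcessor lives in a local module that is not shown here; judging by its use below, process() turns a comment into a bag of normalized words. A minimal sketch of what such a class might look like, assuming gensim's KeyedVectors and a binary word2vec file; the real implementation may differ:

import re
from gensim.models import KeyedVectors

class SentenceProcessorSketch:
    # hypothetical stand-in for the local SentenceProcessor
    def __init__(self, w2v_path):
        # binary=True assumes the .w2v file is in binary word2vec format
        self.w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    def process(self, text):
        # lowercase, extract word tokens, keep only in-vocabulary words
        tokens = re.findall(r'\w+', text.lower())
        return [t for t in tokens if t in self.w2v.key_to_index]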
In [26]:
from tqdm.notebook import tqdm  # tqdm_notebook is deprecated; tqdm.notebook is the current import
# build a word -> frequency dictionary over the training comments
dictionary = {}
for comment in tqdm(reviews_train.comment):
    bag_of_words = sentence_processor.process(comment)
    for word in bag_of_words:
        dictionary.setdefault(word, 0)
        dictionary[word] += 1
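The same count can be written more compactly with collections.Counter, which also exposes the top frequencies directly; a sketch using the same sentence_processor:

from collections import Counter

word_counts = Counter()
for comment in tqdm(reviews_train.comment):
    word_counts.update(sentence_processor.process(comment))

word_counts.most_common(10)  # the most frequent words are the main stop-word candidates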
In [35]:
# treat words that occur more than 1500 times as stop-word 'garbage'
garbage_list = []
for word, value in dictionary.items():
    if value > 1500:
        print('{} -> {}'.format(word, value))
        garbage_list.append(word)
In [40]:
print(garbage_list)
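Downstream, the collected high-frequency words can be dropped from each bag of words before vectorization; a minimal sketch, reusing garbage_list from above:

garbage = set(garbage_list)  # set membership tests are O(1)

def clean_bag(comment):
    # process a comment and drop the high-frequency 'garbage' words
    return [w for w in sentence_processor.process(comment) if w not in garbage]

clean_bag(reviews_train.comment.iloc[0])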