In [1]:
%matplotlib inline
In [88]:
import pandas as pd
import requests
from html.parser import HTMLParser
from os import path
import json
import networkx as nx
from collections import Counter
import tabulate
from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing, feature_extraction
In [5]:
# Load the scraped reviews; skip malformed rows rather than failing.
data = pd.read_csv('~/cloud/data/mvideo/feedback.csv', escapechar='\\', header=None,
                   delimiter=',', engine='c', error_bad_lines=False)
data.columns = ['rating', 'product_id', 'user_name', 'date', 'positive', 'negative', 'review']
In [7]:
# Sanity check: fetch a single product page before scraping the full list.
response = requests.get('http://www.mvideo.ru/products/8342', allow_redirects=True)
In [45]:
class MyHTMLParser(HTMLParser):
    """Collect the breadcrumb trail (category chain) from an M.Video product page."""
    def __init__(self):
        super().__init__()
        self.breadcrumbs = False
        self.chain = []

    def handle_starttag(self, tag, attrs):
        # Each breadcrumb entry is a <span class="c-breadcrumbs__item-text"> element.
        if tag == 'span' and ('class', 'c-breadcrumbs__item-text') in attrs:
            self.breadcrumbs = True

    def handle_data(self, data):
        # The text inside a breadcrumb span is one category name.
        if self.breadcrumbs:
            self.chain.append(data)
            self.breadcrumbs = False
In [54]:
# Cache the scraped category chains; the file name encodes the set of product ids.
dict_path = path.join('..', 'processed', 'product_categories_' +
                      str(hash(frozenset(data.product_id.unique()))) + '.json')
if path.exists(dict_path):
    with open(dict_path, 'r') as f:
        product_categories = json.load(f)
else:
    product_categories = {}
    for pid in data.product_id.unique():
        print('|', end='')  # crude progress indicator
        response = requests.get('http://www.mvideo.ru/products/' + str(pid), allow_redirects=True)
        parser = MyHTMLParser()
        parser.feed(response.content.decode('utf-8'))
        product_categories[pid] = parser.chain
    # JSON object keys must be strings.
    product_categories = {str(k): v for k, v in product_categories.items()}
    with open(dict_path, 'w') as f:
        json.dump(product_categories, f)
In [64]:
# Build a directed graph of parent -> child edges from the breadcrumb chains.
g = nx.DiGraph()
for lst in product_categories.values():
    for node_from, node_to in zip(lst[:-1], lst[1:]):
        # Ignore one-character entries (scraping artifacts such as separators).
        if len(node_to) > 1 and len(node_from) > 1:
            g.add_edge(node_from, node_to)
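A quick sanity check on the graph (a sketch: in a clean hierarchy every non-root node has exactly one parent):
In [ ]:
print(g.number_of_nodes(), 'categories,', g.number_of_edges(), 'edges')
print('roots:', [n for n in g.nodes() if g.in_degree(n) == 0])
print('nodes with multiple parents:', sum(1 for n in g.nodes() if g.in_degree(n) > 1))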
In [70]:
# Distribution of chain lengths: how many breadcrumb levels were captured per product?
Counter([len(x) for x in product_categories.values()])
In [73]:
# One row per product, one column per breadcrumb level.
categories_df = pd.Series(product_categories).apply(pd.Series)
In [85]:
# Products per top-level category.
categories_df.groupby(0)[1].count().sort_values(ascending=False).head(30)
In [86]:
categories_df.to_pickle('../processed/product_categories_by_id.pkl.gz')
In [91]:
# One-hot encode every breadcrumb level.
category_dummies = pd.get_dummies(categories_df)
In [95]:
# The JSON round-trip made the category keys strings; match dtypes before merging.
data.product_id = data.product_id.astype(str)
In [100]:
# Carry 'rating' through the merge so features and target stay row-aligned.
merged = pd.merge(data[['rating', 'product_id']], category_dummies,
                  left_on='product_id', right_index=True)
X = merged.drop(['rating', 'product_id'], axis=1)
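The merge is an inner join, so reviews whose product page yielded no breadcrumbs drop out; a quick coverage check (sketch):
In [ ]:
print(len(data), 'reviews;', len(X), 'matched to a category chain')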
In [111]:
# Target from the merged frame, kept out of X to avoid label leakage.
y = merged.rating.round().astype(int)
# Alternative models tried:
# clf = ensemble.GradientBoostingClassifier(n_estimators=100)
# clf = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced')
# clf = linear_model.LogisticRegressionCV(penalty='l1', class_weight='balanced', solver='liblinear')
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance',
                                                            metric='jaccard'))
y_pred = model_selection.cross_val_predict(clf, X, y, n_jobs=-1,
                                           cv=model_selection.StratifiedKFold(10, shuffle=True,
                                                                              random_state=42))
print(metrics.classification_report(y, y_pred))
print(metrics.confusion_matrix(y, y_pred))
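For scale, a majority-class baseline (a sketch using sklearn's DummyClassifier; not part of the original run) shows what the report looks like when the category features are ignored entirely:
In [ ]:
from sklearn import dummy
baseline_pred = model_selection.cross_val_predict(
    dummy.DummyClassifier(strategy='most_frequent'), X, y,
    cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
print(metrics.classification_report(y, baseline_pred))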