In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import warnings
warnings.filterwarnings("ignore", message="UndefinedMetricWarning")

In [3]:
import sys
sys.path.append("../src/")
import utils

In [4]:
from tabulate import tabulate

import seaborn as sns
import pandas as pd

from gensim.models.word2vec import Word2Vec
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

In [5]:
# Raw training reviews; the CSV header is replaced with snake_case column
# names and the date column is parsed into datetimes.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = [
    'product_id', 'category_level1', 'category_level2', 'brand', 'property',
    'user_name', 'rating', 'date', 'review', 'negative', 'positive',
]
data['date'] = pd.to_datetime(data['date'])

In [6]:
from gensim.models.wrappers import FastText

In [7]:
# Load the pre-trained fastText binary. The loader is called on the class
# directly, so no throwaway empty FastText() instance is created first.
w2v_model = FastText.load_fasttext_format("../external/wiki_w2v_256.bin")

In [14]:
%%timeit
# Micro-benchmark: look up the embedding of every whitespace-separated token
# of the first review.
[w2v_model.wv.word_vec(word) for word in data.review[0].split()]


190 µs ± 4.42 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn import neighbors

In [16]:
# Smoke test: vectorize the first 100 reviews with the tf-idf weighted
# embedding vectorizer from utils.
X = utils.Word2VecTfidfEmbeddingVectorizer(w2v_model).fit_transform(data['review'][:100])

In [18]:
# Expect one row per review and one column per embedding dimension.
X.shape


Out[18]:
(100, 256)

In [ ]:
class KNNDistribution(BaseEstimator, ClassifierMixin):
    """Unfinished classifier skeleton (presumably k-nearest-neighbours based).

    The original cell was not valid Python: ``init`` had no body and was not
    the ``__init__`` constructor, and ``fit`` was missing its colon and body.
    The class is kept as an explicit stub so the notebook can run top to
    bottom; the estimator logic is still to be written.
    """

    def __init__(self):
        # No hyper-parameters are defined yet.
        pass

    def fit(self, X, y=None):
        """Fit the estimator on ``X`` / ``y``.

        Raises:
            NotImplementedError: always, until the estimator is implemented.
        """
        raise NotImplementedError("KNNDistribution.fit is not implemented yet")

In [27]:
# Sanity check of the embeddings: nearest neighbours of the word for "android".
w2v_model.wv.similar_by_word('андроид')


Out[27]:
[('андроида', 0.5150728225708008),
 ('завершившимся', 0.3617841899394989),
 ('мороз', 0.3526996076107025),
 ('мальчиком,', 0.34675005078315735),
 ('Amor', 0.33932390809059143),
 ('андоррский', 0.3385862112045288),
 ('Карибских', 0.3353103697299957),
 ('ангел»,', 0.33428820967674255),
 ('соответствующем', 0.3300006687641144),
 ('баскетболу', 0.32978034019470215)]

In [28]:
# Plain word -> vector lookup built from the model's vocabulary list and its
# embedding matrix (paired up position by position).
w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Embedding-based pipelines: a utils vectorizer built on the word -> vector
# dict `w2v` (presumably mean vs. tf-idf weighted averaging -- see utils),
# followed by a tree ensemble.
# Changes from the previous version:
#   * the gradient-boosting step is no longer mislabelled "extra trees";
#   * the stochastic estimators are seeded (42, as for the oversampler used
#     later in this notebook) so reruns are reproducible.
etree_w2v = Pipeline([
    ("word2vec vectorizer", utils.MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
])
etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", utils.TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
])
gboost_w2v = Pipeline([
    ("word2vec vectorizer", utils.MeanEmbeddingVectorizer(w2v)),
    ("gradient boosting", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])
gboost_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", utils.TfidfEmbeddingVectorizer(w2v)),
    ("gradient boosting", GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

In [30]:
# Bag-of-words baselines: multinomial and Bernoulli naive Bayes, each fed
# either raw term counts or tf-idf weights.
# NOTE(review): bern_nb and bern_nb_tfidf score identically in the result
# tables below -- presumably because BernoulliNB binarizes its input, so the
# weighting scheme is irrelevant; verify.
mult_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer='word')),
    ("multinomial nb", MultinomialNB()),
])
bern_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer='word')),
    ("bernoulli nb", BernoulliNB()),
])
mult_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer='word')),
    ("multinomial nb", MultinomialNB()),
])
bern_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer='word')),
    ("bernoulli nb", BernoulliNB()),
])

# Linear-kernel SVM, a long-standing strong baseline for text classification:
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer='word')),
    ("linear svc", SVC(kernel="linear")),
])
svc_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer='word')),
    ("linear svc", SVC(kernel="linear")),
])

In [31]:
# Features are the raw review texts; the target is the rating rounded to the
# nearest integer class.
X = data['review']
y = data['rating'].round().astype(int)

In [32]:
def run_cross_validation(models, X, y, n_splits=10, n_jobs=-1):
    """Cross-validate every (name, estimator) pair with stratified k-fold.

    Args:
        models: iterable of ``(name, estimator)`` tuples.
        X: input samples passed to ``cross_validate``.
        y: target labels passed to ``cross_validate``.
        n_splits: number of stratified folds.
        n_jobs: parallelism forwarded to ``cross_validate``.

    Returns:
        List of ``(name, cv_result)`` tuples in input order, where
        ``cv_result`` is the dict returned by ``cross_validate`` (micro, macro
        and weighted F1, with train scores included).
    """
    results = []
    for name, model in models:
        print(name)  # progress marker: each model can take minutes to fit
        cv_result = cross_validate(
            model, X, y,
            cv=StratifiedKFold(n_splits),
            n_jobs=n_jobs,
            scoring=['f1_micro', 'f1_macro', 'f1_weighted'],
            return_train_score=True,
        )
        results.append((name, cv_result))
    return results


def mean_test_f1_rows(results):
    """Turn ``run_cross_validation`` output into table rows.

    Returns:
        One ``[name, macro, micro, weighted]`` row per model, holding the
        mean test F1 across folds.
    """
    return [
        [
            name,
            cv_result['test_f1_macro'].mean(),
            cv_result['test_f1_micro'].mean(),
            cv_result['test_f1_weighted'].mean(),
        ]
        for name, cv_result in results
    ]


# Only the bag-of-words pipelines are evaluated on the raw review text here;
# the embedding-based pipelines are left out of this run.
all_models = [
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
]

scores = run_cross_validation(all_models, X, y)
toshow = mean_test_f1_rows(scores)

print(tabulate(toshow, floatfmt=".4f", headers=("model", 'f1_macro', 'f1_micro', 'f1_weighted')))


mult_nb
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
mult_nb_tfidf
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
bern_nb
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
bern_nb_tfidf
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
svc
svc_tfidf
model            f1_macro    f1_micro    f1_weighted
-------------  ----------  ----------  -------------
mult_nb            0.2613      0.6233         0.5287
mult_nb_tfidf      0.1495      0.5960         0.4452
bern_nb            0.2030      0.5891         0.4859
bern_nb_tfidf      0.2030      0.5891         0.4859
svc                0.3763      0.5968         0.5907
svc_tfidf          0.3723      0.6598         0.6053

In [132]:
# NOTE(review): the recorded repr shows w2v_model=None although a model is
# passed in -- presumably the constructor in utils does not store the argument
# under an attribute of the same name; verify in utils.
utils.Word2VecMeanEmbeddingVectorizer(w2v_model=w2v_model)


Out[132]:
Word2VecMeanEmbeddingVectorizer(w2v_model=None)

In [16]:
# Rebinds `data` from the raw CSV frame to the pre-processed frame, so every
# cell below works on the normalized data.
# NOTE: unpickling executes arbitrary code -- only load pickles from a trusted source.
data = pd.read_pickle('../processed/normalized.pkl.gz')

In [19]:
# Features: the normalized token lists re-joined into whitespace-separated
# strings so the text vectorizers can consume them.
# (Disabled alternative: concatenate the token lists of every non-rating column.)
# X = data[[col for col in data.columns if not 'rating' in col]].sum(1).apply(lambda x: ' '.join(x))
X = data['review_alpha_splitted_nostopwords_spellcorrected_normalized'].apply(' '.join)
y = data['rating'].round().astype(int)

In [20]:
# Every pipeline defined so far, evaluated on the normalized review text.
# The glove_* variants stay disabled.
all_models = [
    ("etree_w2v", etree_w2v),
    ("etree_w2v_tfidf", etree_w2v_tfidf),
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
    ("gboost_w2v", gboost_w2v),
    ("gboost_w2v_tfidf", gboost_w2v_tfidf),
]

# 10-fold stratified cross-validation per model, keeping train scores so that
# over-fitting can be inspected afterwards.
scores = []
for name, model in all_models:
    print(name)
    cv_result = cross_validate(
        model, X, y,
        cv=StratifiedKFold(10),
        n_jobs=-1,
        scoring=['f1_micro', 'f1_macro', 'f1_weighted'],
        return_train_score=True,
    )
    scores.append((name, cv_result))

# One row per model: mean test F1 (macro, micro, weighted) across the folds.
toshow = [
    [
        model_name,
        model_cv['test_f1_macro'].mean(),
        model_cv['test_f1_micro'].mean(),
        model_cv['test_f1_weighted'].mean(),
    ]
    for model_name, model_cv in scores
]

print(tabulate(toshow, floatfmt=".4f", headers=("model", 'f1_macro', 'f1_micro', 'f1_weighted')))


etree_w2v
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
etree_w2v_tfidf
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
mult_nb
mult_nb_tfidf
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
bern_nb
bern_nb_tfidf
svc
svc_tfidf
gboost_w2v
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
gboost_w2v_tfidf
model               f1_macro    f1_micro    f1_weighted
----------------  ----------  ----------  -------------
etree_w2v             0.1546      0.5973         0.4490
etree_w2v_tfidf       0.1517      0.5965         0.4476
mult_nb               0.3319      0.6404         0.5813
mult_nb_tfidf         0.1502      0.5975         0.4474
bern_nb               0.2653      0.5995         0.5251
bern_nb_tfidf         0.2653      0.5995         0.5251
svc                   0.3891      0.6005         0.5989
svc_tfidf             0.3839      0.6613         0.6125
gboost_w2v            0.1675      0.5965         0.4558
gboost_w2v_tfidf      0.1666      0.5955         0.4556

In [33]:
# Mean of every recorded metric (timings, train and test F1) for svc_tfidf.
# The model is looked up by name instead of by position (`scores[-3]`), which
# silently pointed at a different model whenever `all_models` changed.
svc_tfidf_cv = dict(scores)["svc_tfidf"]
for k, v in svc_tfidf_cv.items():
    print([k, v.mean()])


['fit_time', 108.47259149551391]
['score_time', 19.826808667182924]
['test_f1_micro', 0.66134388584960568]
['train_f1_micro', 0.83519329316190194]
['test_f1_macro', 0.38394177283502334]
['train_f1_macro', 0.74584213764639973]
['test_f1_weighted', 0.6125198039472205]
['train_f1_weighted', 0.82164514318013937]

In [10]:
from imblearn import pipeline, over_sampling

In [11]:
# imblearn pipeline: tf-idf features, random oversampling with a fixed seed,
# then the linear-kernel SVM.
psvc_tfidf = pipeline.Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer='word')),
    ("resampling", over_sampling.RandomOverSampler(random_state=42)),
    ("linear svc", SVC(kernel="linear")),
])
clf = psvc_tfidf

In [12]:
# 10-fold stratified cross-validation of the oversampled pipeline, with train
# scores kept for comparison against the test scores.
res = cross_validate(
    clf, X, y,
    cv=StratifiedKFold(10),
    n_jobs=4,
    scoring=['f1_micro', 'f1_macro', 'f1_weighted'],
    return_train_score=True,
)

In [14]:
# Fold-averaged value of every metric returned by cross_validate.
for metric_name, fold_values in res.items():
    print([metric_name, fold_values.mean()])


['fit_time', 990.82176380157466]
['score_time', 43.880285596847536]
['test_f1_micro', 0.62846379153653165]
['train_f1_micro', 0.96095359539721326]
['test_f1_macro', 0.4190454802548681]
['train_f1_macro', 0.96518849028288645]
['test_f1_weighted', 0.62757058333413607]
['train_f1_weighted', 0.96137061851770633]

In [16]:
# Same file with its original header. The path is written relative to the home
# directory (like the earlier X_train.csv load) instead of hard-coding one
# user's absolute /Users/... location.
new_df = pd.read_csv('~/cloud/data/mvideo/feedback-1.csv/X_train.csv')

In [53]:
# Peek at the first rows with the original (un-renamed) column names.
new_df.head(10)


Out[53]:
sku categoryLevel1Id categoryLevel2Id brandId property userName reting date comment commentNegative commentPositive
0 20005023 401 4010201 826 [{34: 'f982777489055c6563d68c005fd24aad'}, {36... b2898a81b45310b30beb8fc0c0a9ce1e 2.0 2013-06-28 2,5 года работала и все...устала! Лампочка гор... NaN NaN
1 20020647 403 4030101 1425 [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36... 538c73d64461e13907bb95c51c38bfbc 2.0 2010-07-04 Через 2 месяца после истечении гарантийного ср... NaN NaN
2 20020701 401 4010401 124 [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36... ddca2d0101513a6209db7868eed8be05 4.0 2010-05-27 пользуюсь уже три недели. нареканий ни каких н... NaN NaN
3 30012256 203 2030301 93 [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36... 289c20015b3713a82ba5ddf774d996f7 5.0 2016-10-11 Ребят этот системный блок подойдёт для игры кс... NaN NaN
4 30011341 205 2050201 656 [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36... 5576f82d149d4f688644fef2322c63ef 5.0 2010-02-26 я считаю, что яри замечательный телефон! Прият... NaN NaN
5 20023626 405 4050102 829 [{12671: 'e04af96afe53462f72f39331b209a810'}, ... 2daa7d6326bc2918c6dd46e35ace6b6d 1.0 2014-05-21 Сегодня купила 2 таких вентилятора! Ужасный. С... NaN NaN
6 30011639 203 2030201 995 [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36... d6c4575df3a246fe76b975dec9201a84 5.0 2010-02-27 привезли ноут, по качеству корпуса и дизайну 5... NaN NaN
7 20022938 404 4040203 759 [{897: 'e4da3b7fbbce2345d7772b0674a318d5'}, {2... 720167647dffa170deceda1549c0c906 2.0 2015-01-16 Купила этот увлажнитель, шумный, подсветка меш... NaN NaN
8 20007867 406 4060101 48 [{769: '6b101662e3fb18552fa38924077c789a'}, {1... 7bb1d51e3697a6f9596e9c817c421962 5.0 2010-08-06 Комбайн отличный. Пользуюсь 2 года. Пользуюсь ... NaN NaN
9 20002766 412 4120101 11 [{34: 'f982777489055c6563d68c005fd24aad'}, {36... 009763d797b076d86bc5ce262cbf1de0 5.0 2012-07-23 Отличный аппарат. авто выключение вообще преле... NaN NaN

In [54]:
# Number of reviews per (hashed) user name. value_counts() already returns the
# counts in descending order, so the extra sort_values(ascending=False) was
# redundant (only the order among tied counts may differ).
new_df.userName.value_counts()


Out[54]:
3d801da09e7d82668e226799d9db91dc    630
d6c4575df3a246fe76b975dec9201a84    522
ea0134702859871d98e5756b293f12fa    441
6421219d872c945f8aa9dc434c923394    410
33b67031842d4c56d9dc03571081b77d    408
7bb1d51e3697a6f9596e9c817c421962    405
95c961e3a0c59a7fe3a95683d3d43f0b    334
37339a5698d5e3ea0b50dd53c319f8b4    283
74ef6c68d2b552c6a89a0131dcddbb83    271
cf6d74f84dd326c489a850a69459c073    262
e19c762be9927adbab00d179222e9c70    246
9ef2a20e7442b3b08184546b1297efa0    241
3d432e5d4699fbad6bb0322c5b6945a3    221
aefc9f94d6fcebdb86d41c521dd2c441    218
14fcb108d3847d262b39c7b6d55d5915    216
72fdba974a7160bc2df192ca02021d1e    211
eb8b4da0a5519e12ead4ba90f11b91a0    200
ce62f60b85c785202e72bcd25d2b0f78    179
b2898a81b45310b30beb8fc0c0a9ce1e    169
560e1a279bae50a0248e626224828b4f    164
970095abe0b5c87fb9109719a9d2347a    161
4bc347b999f87509dae2dd9dca8b62ba    158
289c20015b3713a82ba5ddf774d996f7    150
75eac03352656ca882791865a1ee158c    150
acd41b5fc27242d19c244185ba6732f2    148
35dd87248d6c6f9ca5c9a4587e67b6b9    145
63d508b0eeacbf9a4daa07eb17ea613a    134
eae5330f0bde468318e9aead0a393e71    126
7ea66561a8992aafa1309efd2936eb65    125
ec35b71f5df6e38934d327d2f381f966    123
                                   ... 
d22db36b9a3ef696fd067177d61e66a1      1
ce431ce7d563fc1dfc9447049fa89bf3      1
2ffe4e77325d9a7152f7086ea7aa5114      1
dafe2b010165dee6330a73167bcd6b43      1
6a29e01b604dfee42541043346aab28b      1
730df4bda3492ef45f57fd41af2a2dbd      1
9c76d208184f44d015830bf3a04650b9      1
77056d0c9f55a3fc7695aa789d73dca9      1
4195d4babdc1815b0306bf59cc6c9998      1
d9a5195ecd450cd926c62ef68c7937c2      1
53dde8b00be71e9b0dbc7e8463003914      1
52ca932c9067d95f32f7c3132ebee6de      1
71ee03cc326f9056586f952a9b2df0c3      1
f3abb86bd34cf4d52698f14c0da1dc60      1
9e7f840af62b1f6ce500a10545a8b9a1      1
e864a59d5d3fb7d439dd4ddd4797ab22      1
e3260e29607c26082914d3c3c45d1029      1
28202c8dc21c0d39ff12631ecf6be5e8      1
e53672cb76968b4525da9983fa9a09ff      1
41d4fdd5530908c89fbaa3a7c1b93c57      1
f0544a6185d7fa2c883e106f6efad5ff      1
add415f86d0d120db4faa67d20637321      1
3ca6fc7b5d05b023209a5e83bf92e17a      1
9d1791061fdd7cd7f025d328b6b17422      1
bccaa0160c6ca217b3d90d57f06e88db      1
38ec68d3d0b84ec3bc46bce8c1c70cc3      1
898ee50ff519b9eec2327e3597cdad5b      1
40687c8206d15373954d8b27c6724f62      1
f9115d92ef3c30f0600fb1c90be0b37c      1
f23a370aeabb8f2e5f870d144d25c04e      1
Name: userName, Length: 3232, dtype: int64

In [23]:
# NOTE: the built-in hash of a str is salted per interpreter process (unless
# PYTHONHASHSEED is fixed), so this value is not reproducible across kernels.
hash("Алексей")


Out[23]:
7823269791567407573

In [20]:
# Raw `property` column: each cell is a string holding a list of
# single-entry {id: hash} dicts.
new_df.property


Out[20]:
0        [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
1        [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
2        [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
3        [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
4        [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
5        [{12671: 'e04af96afe53462f72f39331b209a810'}, ...
6        [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
7        [{897: 'e4da3b7fbbce2345d7772b0674a318d5'}, {2...
8        [{769: '6b101662e3fb18552fa38924077c789a'}, {1...
9        [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
10       [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
11       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
12       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
13       [{439: '3cfcdd17dfddb03e17fb5cd09a571502'}, {3...
14       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15       [{12590: 'd6c153249031cfb763929c15dc245694'}, ...
16       [{3: '26657d5ff9020d2abefe558796b99584'}, {1: ...
17       [{34: '7e9669405ffffb748b1478cd6c0ab102'}, {36...
18       [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
19       [{439: '3cfcdd17dfddb03e17fb5cd09a571502'}, {1...
20       [{9928: 'e04af96afe53462f72f39331b209a810'}, {...
21       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
22       [{34: '7a853f97c39133b28d15e74ba0d36225'}, {36...
23       [{439: '3cfcdd17dfddb03e17fb5cd09a571502'}, {4...
24       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
25       [{1: '069d3bb002acd8d7dd095917f9efe4cb'}, {2: ...
26       [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
27       [{115: 'd645920e395fedad7bbbed0eca3fe2e0'}, {1...
28       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
29       [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
                               ...                        
15557    [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
15558    [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
15559    [{3: '3ef815416f775098fe977004015c6193'}, {1: ...
15560    [{9766: 'fe7f71d0f87c199c2883338525b44567'}, {...
15561    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15562    [{3: '3ef815416f775098fe977004015c6193'}, {1: ...
15563    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15564    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15565    [{13500: 'e04af96afe53462f72f39331b209a810'}, ...
15566    [{34: '059dacbb68be281264a1ec6d7ec08ebe'}, {36...
15567        [{10909: '54b188f258ccb81d9d43ec736525ae57'}]
15568    [{3: '26657d5ff9020d2abefe558796b99584'}, {1: ...
15569    [{34: '84b6219e2c6df16fa16abe613cc8e855'}, {36...
15570    [{114: '74e8cf2f9d8ec0040beb07c9a82a295d'}, {3...
15571    [{7853: '99bb2ae15a76e0cbb258c9e2b17be20c'}, {...
15572    [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
15573    [{324: 'c4ca4238a0b923820dcc509a6f75849b'}, {8...
15574    [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
15575    [{34: 'f982777489055c6563d68c005fd24aad'}, {36...
15576    [{3: '33e75ff09dd601bbe69f351039152189'}, {1: ...
15577    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15578    [{480: '2dd63a260fb75526413c69d2c1ef9bdd'}, {7...
15579    [{3: '6f4922f45568161a8cdf4ad2299f6d23'}, {1: ...
15580    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15581    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15582    [{439: '3cfcdd17dfddb03e17fb5cd09a571502'}, {3...
15583    [{34: '7e9669405ffffb748b1478cd6c0ab102'}, {36...
15584    [{115: '03cf87174debaccd689c90c34577b82f'}, {1...
15585    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
15586    [{34: '9ce895413ebdf6b6dcb69b07dc782591'}, {36...
Name: property, Length: 15587, dtype: object

In [24]:
# Rating distribution ("reting" is the column's spelling in the source file).
# A few ratings are fractional, which is why the target is rounded above.
new_df.reting.value_counts()


Out[24]:
5.0    9211
4.0    2609
1.0    1468
3.0    1245
2.0     854
4.7      78
4.3      54
3.7      16
3.3      16
2.7      15
2.3      14
1.3       6
1.7       1
Name: reting, dtype: int64

In [26]:
import json

In [38]:
import ast

# The property cell is a Python-literal string; literal_eval turns the first
# row's value into real Python objects without executing arbitrary code.
decoded_object = ast.literal_eval(new_df['property'][0])

In [51]:
def dsum(*dicts):
    """Merge several dicts with string values into one plain dict.

    Keys keep the order in which they are first seen. When a key occurs in
    more than one input, its values are concatenated in argument order.
    """
    merged = {}
    for mapping in dicts:
        for key, value in mapping.items():
            # A missing key starts from the empty string, as defaultdict(str) would.
            merged[key] = merged.get(key, '') + value
    return merged

In [52]:
# Collapse the first row's list of small dicts into a single {id: hash} dict.
dsum(*decoded_object)


Out[52]:
{1: '072b030ba126b2f4b2374f342be9ed44',
 2: 'c0c7c76d30bd3dcaefc96f40275bdc0a',
 3: 'd09bf41544a3365a46c9077ebb5e35c3',
 5: '0c701eb812b630ad4b1de479c5647926',
 14: '860f03a9d8852be8cf3727694c82847c',
 34: 'f982777489055c6563d68c005fd24aad',
 36: 'ba58121b2b2fb5703c3c3597c35d7e7c',
 114: '358e19f99fcb914c5aa9697b59a21429',
 115: '320722549d1751cf3f247855f937b982',
 162: 'd310cb367d993fb6fb584b198a2fd72c',
 183: 'e04af96afe53462f72f39331b209a810',
 342: 'bd12dacbe0580ab0a8842f68fae0a890',
 353: 'e04af96afe53462f72f39331b209a810',
 468: '12f1d7c953e2e33d273404052b02b9fc',
 7641: 'c1f3039316f52acf187ae52be4a2badc',
 7743: '9e9c149882696cd40e2cc4c026804678',
 7843: '92fb0c6d1758261f10d052e6e2c1123c',
 7844: 'b1eec33c726a60554bc78518d5f9b32c',
 7845: 'e04af96afe53462f72f39331b209a810',
 7846: 'e04af96afe53462f72f39331b209a810',
 7853: '3209de58d9d0ec28b1778116d58516d4',
 7991: 'e5ddc6b684de0c9d0259a1455760e582',
 10879: 'fc490ca45c00b1249bbe3554a4fdf6fb',
 10880: 'd09bf41544a3365a46c9077ebb5e35c3',
 11093: '46675ef3c0904af8897e7431a759ae80',
 15593: '0c701eb812b630ad4b1de479c5647926'}

In [47]:
# NOTE: failed experiment -- `defaultdict + dict` is not a supported operation
# and raises TypeError; the dicts are merged with dsum() instead.
sum([defaultdict(str) + val for val in decoded_object])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-47-fd6675882c74> in <module>()
----> 1 sum([defaultdict(str) + val for val in decoded_object])

<ipython-input-47-fd6675882c74> in <listcomp>(.0)
----> 1 sum([defaultdict(str) + val for val in decoded_object])

TypeError: unsupported operand type(s) for +: 'collections.defaultdict' and 'dict'

In [31]:
# NOTE: failed experiment -- the property string is Python-literal syntax
# (integer keys, single-quoted strings), not JSON, so this raises
# JSONDecodeError; ast.literal_eval is used to parse it instead.
json.loads(new_df.property[0][1:-1])


---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
<ipython-input-31-c0840961037a> in <module>()
----> 1 json.loads(new_df.property[0][1:-1])

~/anaconda/lib/python3.6/json/__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    352             parse_int is None and parse_float is None and
    353             parse_constant is None and object_pairs_hook is None and not kw):
--> 354         return _default_decoder.decode(s)
    355     if cls is None:
    356         cls = JSONDecoder

~/anaconda/lib/python3.6/json/decoder.py in decode(self, s, _w)
    337 
    338         """
--> 339         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    340         end = _w(s, end).end()
    341         if end != len(s):

~/anaconda/lib/python3.6/json/decoder.py in raw_decode(self, s, idx)
    353         """
    354         try:
--> 355             obj, end = self.scan_once(s, idx)
    356         except StopIteration as err:
    357             raise JSONDecodeError("Expecting value", s, err.value) from None

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

In [ ]:
import md

In [58]:
# Expand the stringified `property` column into a wide frame:
#   1. parse each cell with ast.literal_eval and pass the parsed items to dsum
#      (presumably a dict-merging helper defined elsewhere -- confirm);
#   2. turn each resulting object into a row via pd.Series.
(
    new_df["property"]
    .apply(lambda raw_props: dsum(*ast.literal_eval(raw_props)))
    .apply(pd.Series)
)


Out[58]:
1 2 3 4 5 6 11 14 17 19 ... 15659 15711 15852 15916 15917 15921 15945 16014 16029 16078
0 072b030ba126b2f4b2374f342be9ed44 c0c7c76d30bd3dcaefc96f40275bdc0a d09bf41544a3365a46c9077ebb5e35c3 NaN 0c701eb812b630ad4b1de479c5647926 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 072b030ba126b2f4b2374f342be9ed44 d645920e395fedad7bbbed0eca3fe2e0 3ef815416f775098fe977004015c6193 NaN b49680152a65c56db99139d7699eadf1 7a53928fa4dd31e82c6ef826f341daec e4da3b7fbbce2345d7772b0674a318d5 860f03a9d8852be8cf3727694c82847c NaN 19a1147c5b24c423f73289c0ad378a0f ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1a5b1e4daae265b790965a275b53ae50 01f78be6f7cad02658508fe4616098a9 f0adc8838f4bdedde4ec2cfad0515589 NaN NaN NaN NaN 3194c31320d9f8bb0c370ac2076bf01c d82c8d1619ad8176d665453cfb2e55f0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 d3d9446802a44259755d38e6d163e820 a5bfc9e07964f8dddeb95fc584cd965d 02e74f10e0327ad868d138f2b4fdd6f0 NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 642e92efb79421734881b53e1e1b18b6 c74d97b01eae257e44aa9d5bade97baf f899139df5e1059396431415e770c6dd NaN c8583aea8edf82e166c9f43ecee66062 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 2838023a778dfaecdc212708f721b788 2838023a778dfaecdc212708f721b788 9b8619251a19057cff70779273e95aa6 NaN b49680152a65c56db99139d7699eadf1 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 58238e9ae2dd305d79c2ebc8c1883422 f340f1b1f65b6df5b5e3f94d95b11daf a5771bce93e200c36f7cd9dfd0e5deaa NaN 7b1c6a4f78be1faf7af2cccf20496883 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 NaN NaN NaN NaN 7b1c6a4f78be1faf7af2cccf20496883 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 NaN NaN NaN NaN b49680152a65c56db99139d7699eadf1 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 NaN NaN NaN NaN c8583aea8edf82e166c9f43ecee66062 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
10 NaN NaN NaN NaN b49680152a65c56db99139d7699eadf1 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11 d9d4f495e875a2e075a1a4a6e1b9770f 1c383cd30b7c298ab50293adfecb7b18 6ea9ab1baa0efb9e19094440c317e21b NaN 0c701eb812b630ad4b1de479c5647926 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12 a5771bce93e200c36f7cd9dfd0e5deaa 3c59dc048e8850243be8079a5c74d079 d645920e395fedad7bbbed0eca3fe2e0 NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 45c48cce2e2d7fbdea1afc51c7c6ad26 45c48cce2e2d7fbdea1afc51c7c6ad26 70efdf2ec9b086079795c442636b55fb NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14 NaN NaN NaN NaN 7b1c6a4f78be1faf7af2cccf20496883 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15 9a1158154dfa42caddbd0694a4e9bdc8 9a1158154dfa42caddbd0694a4e9bdc8 069059b7ef840f0c74a814ec9237b6ec NaN e3bd181b28cac904c6c580fc608474e1 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
16 a684eceee76fc522773286a895bc8436 45c48cce2e2d7fbdea1afc51c7c6ad26 26657d5ff9020d2abefe558796b99584 NaN bbb17e03647b9c4ce705052ba1dfdd93 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
17 c74d97b01eae257e44aa9d5bade97baf c16a5320fa475530d9583c34fd356ef5 e4da3b7fbbce2345d7772b0674a318d5 NaN bbb17e03647b9c4ce705052ba1dfdd93 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
18 6364d3f0f495b6ab9dcf8d3b5c6e0b01 c0c7c76d30bd3dcaefc96f40275bdc0a 7cbbc409ec990f19c78c75bd1e06f215 NaN bbb17e03647b9c4ce705052ba1dfdd93 NaN NaN NaN c0c7c76d30bd3dcaefc96f40275bdc0a NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19 NaN NaN NaN NaN 42f6cdf222dc3b42840900722e62e3e4 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
20 f033ab37c30201f73f142449d037028d b6d767d2f8ed5d21a44b0e5886680cb9 b53b3a3d6ab90ce0268229151c9bde11 NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 49629ae07443afb8e37a14eb529fde3d NaN
21 ac627ab1ccbdb62ec96e702f07f6425b ad9995c0d305d650bc1a138f32113af7 14bfa6bb14875e45bba028a21ed38046 NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
22 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
23 e2c420d928d4bf8ce0ff2ec19b371514 aab3238922bcc25a6f606eb525ffdc56 33e75ff09dd601bbe69f351039152189 NaN 7b1c6a4f78be1faf7af2cccf20496883 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
24 04ecb1fa28506ccb6f72b12c0245ddbc 2b8a61594b1f4c4db0902a8a395ced93 04ecb1fa28506ccb6f72b12c0245ddbc NaN 2d362b46dd7fb0e14f3f4776be8d0200 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25 069d3bb002acd8d7dd095917f9efe4cb 087408522c31eeb1f982bc0eaf81d35f NaN NaN 0c701eb812b630ad4b1de479c5647926 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
26 NaN NaN NaN NaN 68b3ccaab313c35fef49f0d929c9251a NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
27 NaN NaN NaN NaN 8c101ca01320da5fc1acd72ead869c48 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
28 NaN NaN NaN NaN 715bb2608e8e4728b16e3b829c2c737c NaN NaN 3194c31320d9f8bb0c370ac2076bf01c 3ef815416f775098fe977004015c6193 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
29 NaN NaN NaN NaN ceafc9afc787ad155dd9957c6bcdc6a8 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
15557 NaN NaN NaN NaN bd172c9f09a29711b716fc81dcf819e0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15558 NaN NaN NaN NaN 028551516a722dae1b7329a1bfa7b56a NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15559 072b030ba126b2f4b2374f342be9ed44 072b030ba126b2f4b2374f342be9ed44 3ef815416f775098fe977004015c6193 0c701eb812b630ad4b1de479c5647926 b49680152a65c56db99139d7699eadf1 9e984c108157cea74c894b5cf34efc44 e4da3b7fbbce2345d7772b0674a318d5 3194c31320d9f8bb0c370ac2076bf01c NaN 19a1147c5b24c423f73289c0ad378a0f ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15560 072b030ba126b2f4b2374f342be9ed44 c74d97b01eae257e44aa9d5bade97baf 2b44928ae11fb9384c4cf38708677c48 NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15561 6ea9ab1baa0efb9e19094440c317e21b 1f0e3dad99908345f7439f8ffabdffc4 98f13708210194c475687be6106a3b84 NaN 028551516a722dae1b7329a1bfa7b56a NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15562 072b030ba126b2f4b2374f342be9ed44 a1d0c6e83f027327d8461063f4ac58a6 3ef815416f775098fe977004015c6193 NaN b49680152a65c56db99139d7699eadf1 a9b7ba70783b617e9998dc4dd82eb3c5 c33ab40f1472ef16492879f9a7bbf170 66ba7953f34ab0ebeb2fa40e99ed6412 NaN 19a1147c5b24c423f73289c0ad378a0f ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15563 9bf31c7ff062936a96d3c8bd1f8f2ff3 1f0e3dad99908345f7439f8ffabdffc4 6ea9ab1baa0efb9e19094440c317e21b NaN b49680152a65c56db99139d7699eadf1 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15564 c0c7c76d30bd3dcaefc96f40275bdc0a a684eceee76fc522773286a895bc8436 3ef815416f775098fe977004015c6193 NaN 1d772875828ce00fe92b160660a50c86 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15565 NaN NaN NaN NaN 2d362b46dd7fb0e14f3f4776be8d0200 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15566 NaN NaN NaN NaN dbcc63237bad19109a8bd3105c2f7c7c NaN NaN 860f03a9d8852be8cf3727694c82847c 68d30a9594728bc39aa24be94b319d21 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15567 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15568 a684eceee76fc522773286a895bc8436 45c48cce2e2d7fbdea1afc51c7c6ad26 26657d5ff9020d2abefe558796b99584 NaN bbb17e03647b9c4ce705052ba1dfdd93 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15569 NaN NaN NaN NaN 7c1c96b42ae4dc63ef7293efd7cb5015 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c f033ab37c30201f73f142449d037028d NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15570 072b030ba126b2f4b2374f342be9ed44 9f61408e3afb633e50cdf1b20de6f466 3ef815416f775098fe977004015c6193 NaN b49680152a65c56db99139d7699eadf1 a9b7ba70783b617e9998dc4dd82eb3c5 1679091c5a880faf6fb5e6087eb1b2dc 3194c31320d9f8bb0c370ac2076bf01c NaN 19a1147c5b24c423f73289c0ad378a0f ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15571 d490d7b4576290fa60eb31b5fc917ad1 6ea2ef7311b482724a9b7b0bc0dd85c6 2fc3dc9b351df9fe377e5fac7b27ce4e NaN 0c701eb812b630ad4b1de479c5647926 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15572 c16a5320fa475530d9583c34fd356ef5 37693cfc748049e45d87b8c7d8b9aacd 6c8349cc7260ae62e3b1396831a8398f NaN 4d7bafb7e00f1f168ad9e8c76562a8a5 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15573 NaN NaN NaN NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15574 NaN NaN NaN NaN 42f6cdf222dc3b42840900722e62e3e4 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15575 NaN NaN NaN NaN d3468f693b18d74057a7aaf663fb299d NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15576 3c59dc048e8850243be8079a5c74d079 02e74f10e0327ad868d138f2b4fdd6f0 33e75ff09dd601bbe69f351039152189 NaN 907ebb84ccb6822cbb51cca1ba7c1a59 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15577 c0c7c76d30bd3dcaefc96f40275bdc0a 072b030ba126b2f4b2374f342be9ed44 3ef815416f775098fe977004015c6193 NaN 2d362b46dd7fb0e14f3f4776be8d0200 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15578 076a0c97d09cf1a0ec3e19c7f2529f2b 99c5e07b4d5de9d18c350cdf64c5aa3d 076a0c97d09cf1a0ec3e19c7f2529f2b NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN 66ba7953f34ab0ebeb2fa40e99ed6412 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15579 02e74f10e0327ad868d138f2b4fdd6f0 02e74f10e0327ad868d138f2b4fdd6f0 6f4922f45568161a8cdf4ad2299f6d23 NaN b49680152a65c56db99139d7699eadf1 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15580 67c6a1e7ce56d3d6fa748ab6d9af3fd7 6f4922f45568161a8cdf4ad2299f6d23 d67d8ab4f4c10bf22aa353e27879133c NaN b49680152a65c56db99139d7699eadf1 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN ee02b954af0871b794fa46eb01dc9c14 NaN
15581 ffd52f3c7e12435a724a8f30fddadd9c 3cec07e9ba5f5bb252d13f5f431e4bbb d645920e395fedad7bbbed0eca3fe2e0 NaN 1b537a10d34038004ef1ed9c6b372f70 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15582 a1d0c6e83f027327d8461063f4ac58a6 1c383cd30b7c298ab50293adfecb7b18 44f683a84163b3523afe57c2e008bc8c 3cfcdd17dfddb03e17fb5cd09a571502 2441c85ecd5a49dc4c1e971604c7fe32 NaN c81e728d9d4c2f636f067f89cc14862c 860f03a9d8852be8cf3727694c82847c NaN c6c1fb2aa832bc741e94f2b79e345b10 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15583 f033ab37c30201f73f142449d037028d 3c59dc048e8850243be8079a5c74d079 6ea9ab1baa0efb9e19094440c317e21b NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15584 NaN NaN NaN NaN bbb17e03647b9c4ce705052ba1dfdd93 NaN NaN 3194c31320d9f8bb0c370ac2076bf01c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15585 6f4922f45568161a8cdf4ad2299f6d23 33e75ff09dd601bbe69f351039152189 6ea9ab1baa0efb9e19094440c317e21b NaN b49680152a65c56db99139d7699eadf1 NaN NaN 860f03a9d8852be8cf3727694c82847c NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15586 NaN NaN NaN NaN aaf651b7a34b3f9a7514c7cb303e1931 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

15587 rows × 2752 columns


In [57]:
# Number of rows per SKU, most frequent first.
# value_counts() already returns counts in descending order (sort=True,
# ascending=False by default), so no extra sort_values() pass is needed;
# only the relative order of SKUs with equal counts may differ.
new_df["sku"].value_counts()


Out[57]:
20004117    25
20021836    24
20003216    24
20027107    24
20024708    24
20003253    24
20022880    24
20004128    24
20023315    23
20002836    23
20021560    23
20000734    23
20003536    23
30010871    23
20021906    23
20004269    23
20004358    23
20021873    23
30017520    23
20003201    23
20008331    23
20003275    23
30011115    22
20003445    22
20022938    22
50035098    22
20021901    22
20008561    22
20021910    22
20022342    22
            ..
20001176     1
20001052     1
50004382     1
40049627     1
11031514     1
50033622     1
20020922     1
40027903     1
50004054     1
50036255     1
40054904     1
20001072     1
20002086     1
40056961     1
40057187     1
40054968     1
20002763     1
40011192     1
11000004     1
11007627     1
20020798     1
50034254     1
50034299     1
11007550     1
30011122     1
40056642     1
50007620     1
20020866     1
30004871     1
40056274     1
Name: sku, Length: 2698, dtype: int64

In [ ]: