notebook.community

Edit and run



In [251]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import seaborn as sns
import re
from sklearn.feature_extraction import DictVectorizer
sns.set(color_codes=True)

%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [191]:

    
PATH_TO_FOLDER = "../Yelp/Data/"
BUSINESS = "part_businesses.csv"
TIPS = "part_tips.csv"
USERS = "part_users.csv"
REVIEWS = "part_reviews.csv"



In [192]:

    
reviews_df = pd.read_csv(PATH_TO_FOLDER+REVIEWS)
business_df = pd.read_csv(PATH_TO_FOLDER+BUSINESS)
tips_df = pd.read_csv(PATH_TO_FOLDER+TIPS)

tips_df = tips_df[["business_id", "date", "likes", "user_id"]] # Удаляем текст


business_df = business_df[["business_id", "city"]]
business_df.columns = ["business_id", "business_city"]



In [193]:

    
tips_df.head(1)









    Out[193]:






  
    
      
      business_id
      date
      likes
      user_id
    
  
  
    
      0
      mVHrayjG3uZ_RLHkLj-AMg
      2013-01-06
      0
      EZ0r9dKKtEGVx2CdnowPCw



In [194]:

    
reviews_df.head(1)









    Out[194]:






  
    
      
      business_id
      date
      review_id
      stars
      text
      user_id
      vote_cool
      vote_funny
      vote_useful
    
  
  
    
      0
      vcNAWiLM4dR7D2nwwJ7nCA
      2010-03-22
      RF6UnRTtG7tWMcrO2GEoAg
      2
      Unfortunately, the frustration of being Dr. Go...
      H1kH6QZV7Le4zqTRNxoZow
      0
      0
      2

Посмотрим, как распределены оценки за ревью



In [195]:

    
figure(3, figsize=(6,6))
stars_count = reviews_df.groupby("stars")["stars"].count()
labels = ["1", "2", "3", "4", "5"]
explode=(0.03, 0.03, 0.03, 0.03, 0.03)
pie(start_count, labels=labels, shadow=True, startangle=90, explode=explode)
plt.title('Stars of Reviews', bbox={'facecolor':'0.9', 'pad':5})
plt.show()

Видно, что количество положительных оценок превосходит количество отрицательных



In [ ]:

Посчитаем количество слов. Построим график, который покажет, как оценка за рецензию зависит от количества слов в рецензии.



In [173]:

    
def get_count_of_words(text):
    return text.count(' ')+1

reviews_df["count_of_words"] = reviews_df.text.map(lambda s: get_count_of_words(s))



In [174]:

    
a = reviews_df.groupby(["stars", "count_of_words"]).count_of_words.count()
step = reviews_df["count_of_words"].max()
pd.DataFrame(a).plot(kind='barh')
frame = pylab.gca()
pylab.title("STARS | Count of words")
pylab.xlabel("Count of words")
pylab.ylabel("Stars")
frame.axes.get_yaxis().set_ticks([])
frame.legend_.remove()

plt.yticks([step*(i+1)/2 for i in range(5)], ["1","2","3","4","5"], rotation='horizontal');
plt.show()

На это графике видно, что если человее написал в своем обзоре больше 150 слов, то он, однозначно, получит положительную за нее оценку.



In [ ]:

Считаем ревью людей по городам



In [196]:

    
reviews_df = reviews_df.merge(business_df, on='business_id', left_index=True, right_index=False)



In [197]:

    
# Сколько раз для какого города пользователи делали обзоры
business_city_count = pd.DataFrame(reviews_df.groupby(["user_id", "business_city"])["business_city"].count())
business_city_count.columns = ["reviews_for_city"]
business_city_count.reset_index(inplace=True)



In [198]:

    
# Общее количество ревью для каждого пользователя
review_count = pd.DataFrame(reviews_df.groupby("user_id")["user_id"].count())
review_count.columns = ["reviews_count"]
review_count.reset_index(inplace=True)

Построим гистограмму людей по количеству людей по ревью



In [199]:

    
review_count_array = review_count["reviews_count"].as_matrix()
max_ = max(review_count_array)
print max_
plt.figure(figsize=(8,8))
plt.hist(review_count_array, bins=max_)
plt.yscale('log')
plt.xticks([i for i in range(max_+1)])
plt.title('Review for person')
plt.xlabel('Count of review')
plt.ylabel('Count of people')
review_count_array = 0



In [200]:

    
user_city_df = review_count.merge(business_city_count, on='user_id', left_index=True, right_index=False)



In [201]:

    
user_city_df.head(1)









    Out[201]:






  
    
      
      user_id
      reviews_count
      business_city
      reviews_for_city
    
  
  
    
      0
      --65q1FpAL_UQtVZ2PTGew
      4
      Phoenix
      3



In [202]:

    
def is_only_one_city(count1, count2):
    return 1 if (count1==count2) else 0
    
user_city_df["is_only_one_city"] = user_city_df.apply(lambda s: is_only_one_city(s["reviews_count"], s["reviews_for_city"]), axis=1)

Сколько людей писали review только в одном городе, а сколько в нескольких



In [203]:

    
figure(3, figsize=(6,6))
count_of_users = user_city_df[["user_id", "is_only_one_city"]]
count_of_users = count_of_users.drop_duplicates()


count_uniq = count_of_users["is_only_one_city"].value_counts()

labels = ["Only one city", "Several cities"]
explode=(0.05, 0)
pie(count_uniq, labels=labels, shadow=True, startangle=90, explode=explode)
plt.title('How many cities was reviewed', bbox={'facecolor':'0.9', 'pad':5})
plt.show()

count_of_users = 0

Сделаем тоже самое для TIPS



In [204]:

    
tips_city_df = tips_df.merge(business_df, on='business_id', left_index=True, right_index=False)



In [212]:

    
# Сколько раз для какого города пользователи делали tips
tips_city_count = pd.DataFrame(tips_city_df.groupby(["user_id", "business_city"])["business_city"].count())
tips_city_count.columns = ["tips_for_city"]
tips_city_count.reset_index(inplace=True)



In [214]:

    
# Общее количество tips для каждого пользователя
tips_count = pd.DataFrame(tips_city_df.groupby("user_id")["user_id"].count())
tips_count.columns = ["tips_count"]
tips_count.reset_index(inplace=True)



In [216]:

    
tips_city_df = tips_city_count.merge(tips_count, on='user_id', left_index=True, right_index=False)



In [218]:

    
def is_only_one_city(count1, count2):
    return 1 if (count1==count2) else 0
    
tips_city_df["is_only_one_city"] = tips_city_df.apply(lambda s: is_only_one_city(s["tips_for_city"], s["tips_count"]), axis=1)
tips_city_df.head()









    Out[218]:






  
    
      
      user_id
      business_city
      tips_for_city
      tips_count
      is_only_one_city
    
  
  
    
      0
      --65q1FpAL_UQtVZ2PTGew
      Las Vegas
      1
      2
      0
    
    
      0
      --65q1FpAL_UQtVZ2PTGew
      Phoenix
      1
      2
      0
    
    
      1
      --f43ruUt7LBeB3aU74z-w
      Charlotte
      1
      1
      1
    
    
      2
      -2OWhxjHGfjArURE6ABhJQ
      Las Vegas
      1
      1
      1
    
    
      3
      -2jevGd5B6dqAT7AwBW6lA
      Phoenix
      1
      1
      1



In [224]:

    
tips_count_array = tips_city_df["tips_count"].as_matrix()
max_ = max(tips_count_array)
print max_
plt.figure(figsize=(8,8))
plt.hist(tips_count_array, bins=max_)
plt.yscale('log')
plt.xticks([i for i in range(max_+1)])
plt.title('Review for person')
plt.xlabel('Count of review')
plt.ylabel('Count of people')
review_count_array = 0



In [222]:

    
figure(3, figsize=(6,6))
count_of_users = tips_city_df[["user_id", "is_only_one_city"]]
count_of_users = count_of_users.drop_duplicates()


count_uniq = count_of_users["is_only_one_city"].value_counts()

labels = ["Only one city", "Several cities"]
explode=(0.05, 0)
pie(count_uniq, labels=labels, shadow=True, startangle=90, explode=explode)
plt.title('How many cities was tipped', bbox={'facecolor':'0.9', 'pad':5})
plt.show()

count_of_users = 0

Построим облако тегов для Tips и Reviews



In [305]:

    
def get_words(text):
    """returns list of words"""
    text = re.sub(r"_+", '', text) # Удаляем __, они, почему-то, входят в \w
    text = re.sub("\w*\d\w*", "", text) # Удаляем все цифры и слова с цифрами
    array = re.findall(r'[\w]+', text) # Выделяем слова
    return array


def get_tokens(words):
    """returns list of tokens"""
    stop_words = stopwords.words('english')
    wordnet_lemmatizer = WordNetLemmatizer()
    
    lems = []
    for w in words:
        w = w.lower()
        if w not in stop_words:
            w = wordnet_lemmatizer.lemmatize(w)
            lems.append(w)
    return lems


def get_df_tokens(df):
    STEP = 2000
    df_step = df[df.index<STEP]
    i = 1
    
    row = {}
    while(len(df_step)):
        text_array = df_step["text"]
        text_array =  text_array.as_matrix()
        for text in text_array:
            text_array = get_words(text)
            tokens = get_tokens(text_array)
            
            for token in tokens:
                if token in row:
                    row[token] += 1
                else:
                    row[token] = 1
                    
        df_step = df[(df.index >= i*STEP) & (df.index<(i+1)*STEP)]
        i+=1
    return row


def draw_tag_cloud(tokens):
        
    list_of_couple = []
    for key, value in tokens.iteritems():
        list_of_couple.append((key, value))
    
    list_of_couple.sort(key=lambda tup: tup[1], reverse=True)
    wordcloud = WordCloud(background_color="white", max_words=200000, width=1000, height=700).generate_from_frequencies(list_of_couple)
    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

Построим облако тегов для TIPS



In [301]:

    
tokens = get_df_tokens(tips_df)
draw_tag_cloud(tokens)



In [ ]:

    
tokens = get_df_tokens(tips_df)
draw_tag_cloud(tokens)

Построим облако тегов для REVIEW



In [306]:

    
reviews_df = pd.read_csv(PATH_TO_FOLDER+REVIEWS)
tokens = get_df_tokens(reviews_df)
draw_tag_cloud(tokens)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
star_and_count = reviews_df.groupby(["stars", "count_of_words"]).count_of_words.count()
star_and_count = pd.DataFrame(star_and_count)
star_and_count.columns = ["freq"]
star_and_count.reset_index(inplace=True)

STEPS_COUNT = 40
max_ = star_and_count.stars.max()
step = max_/STEPS_COUNT+1

hist_array = []

for star in range(star_and_count.stars.max()):
    star_i = star_and_count[star_and_count["stars"]==star+1]
    star_i["label"] = pd.cut(star_i["count_of_words"], [i*25 for i in range(60)])
    
    count_of_words_in_interval = star_i.groupby("label")["freq"].sum().as_matrix()
    count_of_words_in_interval = np.nan_to_num(count_of_words_in_interval)
    hist_array.append(count_of_words_in_interval)
    print len(count_of_words_in_interval)



In [189]:

    
for hi in hist_array:
    plt.bar(hi, range(len(hi)), alpha=0.6)
    plt.yscale('log')
    plt.show()

#len(hi)
#star_i["label"].unique()



In [182]:

    
len(hi)









    Out[182]:





59



In [147]:

    
a1["label"] = pd.cut(a1["count_of_words"], [i*25 for i in range(50)])









    



/home/popka/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [152]:

    
s = pd.cut(a1["count_of_words"], [i*25 for i in range(50)])



In [156]:

    
s.unique()









    Out[156]:





[(0, 25], (25, 50], (50, 75], (75, 100], (100, 125], ..., (900, 925], (925, 950], (950, 975], (975, 1000], (1000, 1025]]
Length: 41
Categories (41, object): [(0, 25] < (25, 50] < (50, 75] < (75, 100] ... (925, 950] < (950, 975] < (975, 1000] < (1000, 1025]]



In [148]:

    
np.nan_to_num(a1.groupby("label")["freq"].sum().as_matrix())









    Out[148]:





array([ 1153.,  2190.,  2070.,  1872.,  1517.,  1217.,  1032.,   851.,
         666.,   531.,   467.,   372.,   303.,   250.,   213.,   182.,
         158.,   130.,   105.,   105.,    73.,    63.,    50.,    49.,
          46.,    39.,    30.,    29.,    22.,    19.,     8.,    15.,
          27.,    10.,    10.,    14.,    14.,    14.,    11.,     7.,
           5.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.])

Идеи:

Длина текста + оценка (зависимости длины текста от оценки)
Облако тегов из текста. Можно для разного количества звезд.
Кто чаще получает положительные оценки - иностранцы или нет



In [7]:

    
x = np.random.normal(size=100)
sns.distplot(x);
sns.distplot(x/2);
sns.distplot(x/3);



In [7]:

    
pd.cut









    Out[7]:





<function pandas.tools.tile.cut>



In [62]:

    
d = pd.DataFrame(a)
d.columns = ["words_count"]



In [70]:

    
d.reset_index(inplace=True)
d = d[d["stars"]==1]



In [69]:









    



  File "<ipython-input-69-5a06a4a4e17d>", line 1
    d.groupby(pd.cut(d["count_of_words"]).head()
                                                ^
SyntaxError: unexpected EOF while parsing



In [26]:

    
def get_tokens(words):
    """returns list of tokens"""
    stop_words = stopwords.words('english')
    wordnet_lemmatizer = WordNetLemmatizer()
    
    lems = []
    for w in words:
        w = w.lower()
        if w not in stop_words:
            w = wordnet_lemmatizer.lemmatize(w)
            lems.append(w)
    return lems



In [ ]:



In [50]:

    
d.groupby(pd.cut(d["words_count"], bins=[i*25 for i in range(10)])).count()









    Out[50]:






  
    
      
      stars
      count_of_words
      words_count
    
    
      words_count
      
      
      
    
  
  
    
      (0, 25]
      2524
      2524
      2524
    
    
      (25, 50]
      391
      391
      391
    
    
      (50, 75]
      293
      293
      293
    
    
      (75, 100]
      182
      182
      182
    
    
      (100, 125]
      106
      106
      106
    
    
      (125, 150]
      54
      54
      54
    
    
      (150, 175]
      36
      36
      36
    
    
      (175, 200]
      32
      32
      32
    
    
      (200, 225]
      27
      27
      27



In [35]:

    
s = pd.DataFrame(a)
s.columns = ["freq"]
s.reset_index(inplace=True)



In [44]:









    Out[44]:






  
    
      
      stars
      count_of_words
      freq
    
  
  
    
      0
      1
      1
      32
    
    
      1
      1
      2
      12
    
    
      2
      1
      3
      26
    
    
      3
      1
      4
      30
    
    
      4
      1
      5
      22
    
    
      5
      1
      6
      30
    
    
      6
      1
      7
      27
    
    
      7
      1
      8
      33
    
    
      8
      1
      9
      34
    
    
      9
      1
      10
      41
    
    
      10
      1
      11
      33
    
    
      11
      1
      12
      38
    
    
      12
      1
      13
      39
    
    
      13
      1
      14
      40
    
    
      14
      1
      15
      37
    
    
      15
      1
      16
      56
    
    
      16
      1
      17
      53
    
    
      17
      1
      18
      82
    
    
      18
      1
      19
      66
    
    
      19
      1
      20
      71
    
    
      20
      1
      21
      70
    
    
      21
      1
      22
      73
    
    
      22
      1
      23
      68
    
    
      23
      1
      24
      77
    
    
      24
      1
      25
      63
    
    
      25
      1
      26
      78
    
    
      26
      1
      27
      94
    
    
      27
      1
      28
      76
    
    
      28
      1
      29
      81
    
    
      29
      1
      30
      87
    
    
      ...
      ...
      ...
      ...
    
    
      3786
      5
      899
      1
    
    
      3787
      5
      900
      2
    
    
      3788
      5
      901
      1
    
    
      3789
      5
      904
      1
    
    
      3790
      5
      909
      1
    
    
      3791
      5
      913
      1
    
    
      3792
      5
      916
      2
    
    
      3793
      5
      917
      1
    
    
      3794
      5
      918
      1
    
    
      3795
      5
      919
      2
    
    
      3796
      5
      922
      1
    
    
      3797
      5
      924
      1
    
    
      3798
      5
      925
      2
    
    
      3799
      5
      927
      1
    
    
      3800
      5
      929
      1
    
    
      3801
      5
      931
      1
    
    
      3802
      5
      934
      1
    
    
      3803
      5
      944
      1
    
    
      3804
      5
      945
      1
    
    
      3805
      5
      946
      2
    
    
      3806
      5
      953
      2
    
    
      3807
      5
      956
      1
    
    
      3808
      5
      961
      1
    
    
      3809
      5
      962
      1
    
    
      3810
      5
      971
      1
    
    
      3811
      5
      980
      1
    
    
      3812
      5
      984
      2
    
    
      3813
      5
      997
      1
    
    
      3814
      5
      1098
      1
    
    
      3815
      5
      1340
      1
    
  

3816 rows × 3 columns



In [49]:

    
m = s[s["stars"]==1]
#m.groupby(pd.cut(s["count_of_words"], bins=[i*25 for i in range(10)])).count()









    Out[49]:






  
    
      
      stars
      count_of_words
      freq
    
    
      count_of_words
      
      
      
    
  
  
    
      (0, 25]
      25
      25
      25
    
    
      (25, 50]
      25
      25
      25
    
    
      (50, 75]
      25
      25
      25
    
    
      (75, 100]
      25
      25
      25
    
    
      (100, 125]
      25
      25
      25
    
    
      (125, 150]
      25
      25
      25
    
    
      (150, 175]
      25
      25
      25
    
    
      (175, 200]
      25
      25
      25
    
    
      (200, 225]
      25
      25
      25



In [53]:

    
for i in range(5):
    print s[s["stars"]==i+1].groupby(pd.cut(s["count_of_words"], bins=[i*25 for i in range(10)]))









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-377823c06eb1> in <module>()
      1 for i in range(5):
----> 2     print s[s["stars"]==i+1].groupby(pd.cut(s["count_of_words"], bins=[i*25 for i in range(10)]))

/home/popka/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze)
   3434         axis = self._get_axis_number(axis)
   3435         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
-> 3436                        sort=sort, group_keys=group_keys, squeeze=squeeze)
   3437 
   3438     def asfreq(self, freq, method=None, how=None, normalize=False):

/home/popka/anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in groupby(obj, by, **kwds)
   1309         raise TypeError('invalid type: %s' % type(obj))
   1310 
-> 1311     return klass(obj, by, **kwds)
   1312 
   1313 

/home/popka/anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze)
    416         if grouper is None:
    417             grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
--> 418                                                     level=level, sort=sort)
    419 
    420         self.obj = obj

/home/popka/anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in _get_grouper(obj, key, axis, level, sort)
   2269 
   2270         if is_categorical_dtype(gpr) and len(gpr) != len(obj):
-> 2271             raise ValueError("Categorical dtype grouper must have len(grouper) == len(data)")
   2272 
   2273         ping = Grouping(group_axis, gpr, obj=obj, name=name,

ValueError: Categorical dtype grouper must have len(grouper) == len(data)

	user_id	business_city	tips_for_city	tips_count	is_only_one_city
0	--65q1FpAL_UQtVZ2PTGew	Las Vegas	1	2	0
0	--65q1FpAL_UQtVZ2PTGew	Phoenix	1	2	0
1	--f43ruUt7LBeB3aU74z-w	Charlotte	1	1	1
2	-2OWhxjHGfjArURE6ABhJQ	Las Vegas	1	1	1
3	-2jevGd5B6dqAT7AwBW6lA	Phoenix	1	1	1

	stars	count_of_words	words_count
words_count
(0, 25]	2524	2524	2524
(25, 50]	391	391	391
(50, 75]	293	293	293
(75, 100]	182	182	182
(100, 125]	106	106	106
(125, 150]	54	54	54
(150, 175]	36	36	36
(175, 200]	32	32	32
(200, 225]	27	27	27

	stars	count_of_words	freq
0	1	1	32
1	1	2	12
2	1	3	26
3	1	4	30
4	1	5	22
5	1	6	30
6	1	7	27
7	1	8	33
8	1	9	34
9	1	10	41
10	1	11	33
11	1	12	38
12	1	13	39
13	1	14	40
14	1	15	37
15	1	16	56
16	1	17	53
17	1	18	82
18	1	19	66
19	1	20	71
20	1	21	70
21	1	22	73
22	1	23	68
23	1	24	77
24	1	25	63
25	1	26	78
26	1	27	94
27	1	28	76
28	1	29	81
29	1	30	87
...	...	...	...
3786	5	899	1
3787	5	900	2
3788	5	901	1
3789	5	904	1
3790	5	909	1
3791	5	913	1
3792	5	916	2
3793	5	917	1
3794	5	918	1
3795	5	919	2
3796	5	922	1
3797	5	924	1
3798	5	925	2
3799	5	927	1
3800	5	929	1
3801	5	931	1
3802	5	934	1
3803	5	944	1
3804	5	945	1
3805	5	946	2
3806	5	953	2
3807	5	956	1
3808	5	961	1
3809	5	962	1
3810	5	971	1
3811	5	980	1
3812	5	984	2
3813	5	997	1
3814	5	1098	1
3815	5	1340	1

	stars	count_of_words	freq
count_of_words
(0, 25]	25	25	25
(25, 50]	25	25	25
(50, 75]	25	25	25
(75, 100]	25	25	25
(100, 125]	25	25	25
(125, 150]	25	25	25
(150, 175]	25	25	25
(175, 200]	25	25	25
(200, 225]	25	25	25