In [251]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import seaborn as sns
import re
from sklearn.feature_extraction import DictVectorizer
sns.set(color_codes=True)

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [191]:
PATH_TO_FOLDER = "../Yelp/Data/"
BUSINESS = "part_businesses.csv"
TIPS = "part_tips.csv"
USERS = "part_users.csv"
REVIEWS = "part_reviews.csv"

In [192]:
reviews_df = pd.read_csv(PATH_TO_FOLDER+REVIEWS)
business_df = pd.read_csv(PATH_TO_FOLDER+BUSINESS)
tips_df = pd.read_csv(PATH_TO_FOLDER+TIPS)

tips_df = tips_df[["business_id", "date", "likes", "user_id"]] # drop the text column


business_df = business_df[["business_id", "city"]]
business_df.columns = ["business_id", "business_city"]

In [193]:
tips_df.head(1)


Out[193]:
business_id date likes user_id
0 mVHrayjG3uZ_RLHkLj-AMg 2013-01-06 0 EZ0r9dKKtEGVx2CdnowPCw

In [194]:
reviews_df.head(1)


Out[194]:
business_id date review_id stars text user_id vote_cool vote_funny vote_useful
0 vcNAWiLM4dR7D2nwwJ7nCA 2010-03-22 RF6UnRTtG7tWMcrO2GEoAg 2 Unfortunately, the frustration of being Dr. Go... H1kH6QZV7Le4zqTRNxoZow 0 0 2

Let's look at how the review star ratings are distributed.


In [195]:
figure(3, figsize=(6,6))
stars_count = reviews_df.groupby("stars")["stars"].count()
labels = ["1", "2", "3", "4", "5"]
explode=(0.03, 0.03, 0.03, 0.03, 0.03)
pie(stars_count, labels=labels, shadow=True, startangle=90, explode=explode)
plt.title('Stars of Reviews', bbox={'facecolor':'0.9', 'pad':5})
plt.show()


We can see that the number of positive ratings clearly exceeds the number of negative ones.
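
To put numbers on that, the exact share of each rating can be printed from the same stars_count series (a quick check, not part of the original flow):

In [ ]:
# Share of each star rating among all reviews (uses stars_count from the pie-chart cell above).
print stars_count / float(stars_count.sum())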


Let's count the number of words in each review and plot how the rating depends on the review length in words.


In [173]:
def get_count_of_words(text):
    return text.count(' ')+1

reviews_df["count_of_words"] = reviews_df.text.map(lambda s: get_count_of_words(s))
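
Counting spaces is only a rough proxy for the number of words; a regex-based count is slightly more robust (a sketch for comparison, not used in the rest of the notebook):

In [ ]:
# Alternative word count based on a regex tokenizer (hypothetical helper, for comparison only).
def get_count_of_words_regex(text):
    return len(re.findall(r"\w+", text))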

In [174]:
a = reviews_df.groupby(["stars", "count_of_words"]).count_of_words.count()
step = reviews_df["count_of_words"].max()
pd.DataFrame(a).plot(kind='barh')
frame = pylab.gca()
pylab.title("STARS | Count of words")
pylab.xlabel("Count of words")
pylab.ylabel("Stars")
frame.axes.get_yaxis().set_ticks([])
frame.legend_.remove()

plt.yticks([step*(i+1)/2 for i in range(5)], ["1","2","3","4","5"], rotation='horizontal');
plt.show()


The chart shows that if someone wrote more than roughly 150 words in a review, it almost always received a positive rating.
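
A quick way to check that threshold is to compare the average rating of short and long reviews (a rough sanity check on the same dataframe):

In [ ]:
# Average star rating below/above the 150-word mark.
long_mask = reviews_df["count_of_words"] > 150
print "long reviews: ", reviews_df[long_mask]["stars"].mean()
print "short reviews:", reviews_df[~long_mask]["stars"].mean()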


Now let's count users' reviews by city.


In [196]:
reviews_df = reviews_df.merge(business_df, on='business_id', left_index=True, right_index=False)

In [197]:
# How many reviews each user wrote for each city
business_city_count = pd.DataFrame(reviews_df.groupby(["user_id", "business_city"])["business_city"].count())
business_city_count.columns = ["reviews_for_city"]
business_city_count.reset_index(inplace=True)

In [198]:
# Total number of reviews per user
review_count = pd.DataFrame(reviews_df.groupby("user_id")["user_id"].count())
review_count.columns = ["reviews_count"]
review_count.reset_index(inplace=True)
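
As a consistency check, the per-city counts should sum to the per-user totals, since both aggregations come from the same reviews_df (a quick assertion; it assumes there are no missing city values):

In [ ]:
# Both frames count the same rows of reviews_df, so their totals must agree.
assert business_city_count["reviews_for_city"].sum() == review_count["reviews_count"].sum()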

Let's plot a histogram of users by their number of reviews.


In [199]:
review_count_array = review_count["reviews_count"].as_matrix()
max_ = max(review_count_array)
print max_
plt.figure(figsize=(8,8))
plt.hist(review_count_array, bins=max_)
plt.yscale('log')
plt.xticks([i for i in range(max_+1)])
plt.title('Reviews per person')
plt.xlabel('Count of reviews')
plt.ylabel('Count of people')
review_count_array = 0


19

In [200]:
user_city_df = review_count.merge(business_city_count, on='user_id', left_index=True, right_index=False)

In [201]:
user_city_df.head(1)


Out[201]:
user_id reviews_count business_city reviews_for_city
0 --65q1FpAL_UQtVZ2PTGew 4 Phoenix 3

In [202]:
def is_only_one_city(count1, count2):
    return 1 if (count1==count2) else 0
    
user_city_df["is_only_one_city"] = user_city_df.apply(lambda s: is_only_one_city(s["reviews_count"], s["reviews_for_city"]), axis=1)
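
The same flag can also be computed without apply, as a vectorized comparison (an equivalent sketch):

In [ ]:
# Vectorized equivalent of the apply above.
user_city_df["is_only_one_city"] = (user_city_df["reviews_count"] == user_city_df["reviews_for_city"]).astype(int)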

How many users wrote reviews in only one city, and how many in several?


In [203]:
figure(3, figsize=(6,6))
count_of_users = user_city_df[["user_id", "is_only_one_city"]]
count_of_users = count_of_users.drop_duplicates()


count_uniq = count_of_users["is_only_one_city"].value_counts()

labels = ["Only one city", "Several cities"]
explode=(0.05, 0)
pie(count_uniq, labels=labels, shadow=True, startangle=90, explode=explode)
plt.title('How many cities were reviewed', bbox={'facecolor':'0.9', 'pad':5})
plt.show()

count_of_users = 0


Let's do the same for TIPS.


In [204]:
tips_city_df = tips_df.merge(business_df, on='business_id', left_index=True, right_index=False)

In [212]:
# How many tips each user left for each city
tips_city_count = pd.DataFrame(tips_city_df.groupby(["user_id", "business_city"])["business_city"].count())
tips_city_count.columns = ["tips_for_city"]
tips_city_count.reset_index(inplace=True)

In [214]:
# Total number of tips per user
tips_count = pd.DataFrame(tips_city_df.groupby("user_id")["user_id"].count())
tips_count.columns = ["tips_count"]
tips_count.reset_index(inplace=True)

In [216]:
tips_city_df = tips_city_count.merge(tips_count, on='user_id', left_index=True, right_index=False)

In [218]:
def is_only_one_city(count1, count2):
    return 1 if (count1==count2) else 0
    
tips_city_df["is_only_one_city"] = tips_city_df.apply(lambda s: is_only_one_city(s["tips_for_city"], s["tips_count"]), axis=1)
tips_city_df.head()


Out[218]:
user_id business_city tips_for_city tips_count is_only_one_city
0 --65q1FpAL_UQtVZ2PTGew Las Vegas 1 2 0
0 --65q1FpAL_UQtVZ2PTGew Phoenix 1 2 0
1 --f43ruUt7LBeB3aU74z-w Charlotte 1 1 1
2 -2OWhxjHGfjArURE6ABhJQ Las Vegas 1 1 1
3 -2jevGd5B6dqAT7AwBW6lA Phoenix 1 1 1

In [224]:
tips_count_array = tips_city_df["tips_count"].as_matrix()
max_ = max(tips_count_array)
print max_
plt.figure(figsize=(8,8))
plt.hist(tips_count_array, bins=max_)
plt.yscale('log')
plt.xticks([i for i in range(max_+1)])
plt.title('Tips per person')
plt.xlabel('Count of tips')
plt.ylabel('Count of people')
tips_count_array = 0


27

In [222]:
figure(3, figsize=(6,6))
count_of_users = tips_city_df[["user_id", "is_only_one_city"]]
count_of_users = count_of_users.drop_duplicates()


count_uniq = count_of_users["is_only_one_city"].value_counts()

labels = ["Only one city", "Several cities"]
explode=(0.05, 0)
pie(count_uniq, labels=labels, shadow=True, startangle=90, explode=explode)
plt.title('How many cities were tipped', bbox={'facecolor':'0.9', 'pad':5})
plt.show()

count_of_users = 0


Let's build word clouds for Tips and Reviews.


In [305]:
def get_words(text):
    """returns list of words"""
    text = re.sub(r"_+", '', text) # remove underscores, which \w would otherwise match
    text = re.sub("\w*\d\w*", "", text) # remove digits and any words containing digits
    array = re.findall(r'[\w]+', text) # extract words
    return array
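# Example (hypothetical input): get_words("I ate 2 tacos at Joe's") -> ['I', 'ate', 'tacos', 'at', 'Joe', 's']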


def get_tokens(words):
    """returns list of tokens"""
    stop_words = stopwords.words('english')
    wordnet_lemmatizer = WordNetLemmatizer()
    
    lems = []
    for w in words:
        w = w.lower()
        if w not in stop_words:
            w = wordnet_lemmatizer.lemmatize(w)
            lems.append(w)
    return lems


def get_df_tokens(df):
    """Returns a dict of token -> frequency over all texts, processed STEP rows at a time."""
    STEP = 2000
    df_step = df[df.index < STEP]
    i = 1

    row = {}
    while len(df_step):
        for text in df_step["text"].as_matrix():
            words = get_words(text)
            tokens = get_tokens(words)

            for token in tokens:
                if token in row:
                    row[token] += 1
                else:
                    row[token] = 1

        df_step = df[(df.index >= i*STEP) & (df.index < (i+1)*STEP)]
        i += 1
    return row
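
# An equivalent, more compact frequency count using collections.Counter
# (a sketch kept only for comparison; the chunked version above is what is used below):
from collections import Counter

def get_df_tokens_counter(df):
    """Token frequencies over all texts at once, without manual chunking."""
    counts = Counter()
    for text in df["text"]:
        counts.update(get_tokens(get_words(text)))
    return dict(counts)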


def draw_tag_cloud(tokens):
        
    list_of_couple = []
    for key, value in tokens.iteritems():
        list_of_couple.append((key, value))
    
    list_of_couple.sort(key=lambda tup: tup[1], reverse=True)
    wordcloud = WordCloud(background_color="white", max_words=200000, width=1000, height=700).generate_from_frequencies(list_of_couple)
    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
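
Note: newer versions of the wordcloud package expect a dict rather than a list of (word, count) pairs in generate_from_frequencies. If the call above fails, a sketch of the adaptation (draw_tag_cloud_dict is a hypothetical variant, not part of the original notebook):

In [ ]:
# Sketch for newer wordcloud versions: generate_from_frequencies takes a dict of word -> count.
def draw_tag_cloud_dict(tokens):
    wordcloud = WordCloud(background_color="white", max_words=200000, width=1000, height=700).generate_from_frequencies(tokens)
    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()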

Build the word cloud for TIPS.


In [301]:
tips_df = pd.read_csv(PATH_TO_FOLDER+TIPS) # re-read tips to restore the text column dropped earlier
tokens = get_df_tokens(tips_df)
draw_tag_cloud(tokens)




Build the word cloud for REVIEWS.


In [306]:
reviews_df = pd.read_csv(PATH_TO_FOLDER+REVIEWS)
tokens = get_df_tokens(reviews_df)
draw_tag_cloud(tokens)