Ratings

In this notebook we'll plot the company ratings.

To see how the data was obtained, go to the Scraping notebook. For further analysis, go to the Topic Modeling notebook.


In [ ]:
from pymongo import MongoClient
import datetime
import reviews_data
# from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.corpus import stopwords
# import re
# import string
# from gensim import corpora, models, similarities
import matplotlib.pyplot as plt
import utils
import numpy as np

In [ ]:
# MongoDB handles: a default MongoClient() connection, then the databases
# and collections the rest of the notebook reads from
client = MongoClient()

indeed_db = client['indeed']
indeed_jobs = indeed_db['jobs']
indeed_reviews = indeed_db['reviews']

glassdoor_db = client['glassdoor']
glassdoor_reviews = glassdoor_db['reviews']

In [ ]:
# Sort the de-duplicated company names: set iteration order is arbitrary, and a
# later cell picks a company by position, so an unsorted list would make that
# choice change from run to run
companies = sorted(set(utils.get_company_names(indeed_reviews)))

# Keep only records with more than one field, i.e. drop ratings that carry
# no rating information
ratings = [
    rating
    for rating in reviews_data.all_ratings(indeed_reviews, glassdoor_reviews)
    if len(rating) > 1
]
len(ratings)

In [ ]:
# Peek at the first kept record to see which fields a rating carries
ratings[0]

In [ ]:
def get_company_ratings(company, ratings, key):
    """Collect one company's scores for a single rating field.

    company -- company name; compared case-insensitively with each
               record's 'company' value
    ratings -- iterable of rating records (mappings with a 'company' entry)
    key     -- rating field to extract, e.g. 'rating'

    Returns a list of floats, in input order, one for every record of that
    company which contains `key`.
    """
    return [
        float(entry[key])
        for entry in ratings
        if entry['company'].lower() == company.lower() and key in entry
    ]

def draw_scores(scores, title="", show = True, save = False):
    """Bin star scores into five buckets and plot them with draw_hist.

    scores -- sequence of numeric star ratings
    title  -- figure title, passed through to draw_hist
    show   -- passed through to draw_hist
    save   -- passed through to draw_hist

    Prints a message and returns without plotting when `scores` is empty.
    """
    if len(scores) == 0:
        # Call form of print: valid on both Python 2 and Python 3
        print("No scores to plot")
        return
    # Bin edges sit at 0.01, 1.01, ..., 5.01 so that a whole-star score n falls
    # in bucket n-1; values outside that range are not counted
    hist = np.histogram(scores,bins=5,range=[0.01,5.01])[0]
    draw_hist(hist, title, show, save)

def draw_hist(hist, title = "", show = True, save = False):
    """Draw a five-bar chart of star-rating counts, optionally saving it as a PNG.

    hist  -- five bar heights; bar i is drawn over star value i+1
    title -- figure title; when saving it also becomes the file name under
             'images/' ('/' replaced by '-', ' ' by '_', '.png' appended)
    show  -- display the figure with plt.show()
    save  -- write the figure to disk before it is shown
    """
    plt.figure(figsize=(8,6))
    # Centre each 0.8-wide bar on its star value explicitly. Passing left edges
    # 0.6, 1.6, ... and relying on the default alignment only lines up on
    # matplotlib < 2.0; newer versions centre bars on x by default, which
    # would shift every bar 0.4 to the left of its tick.
    plt.bar([star + 1 for star in range(5)], hist, width=0.8, align='center')
    plt.title(title,fontsize=25)
    plt.xlabel("Stars",fontsize=16)
    plt.ylabel("Number of reviews",fontsize=16)
    plt.xticks([0,1,2,3,4,5])
    plt.tick_params(labelsize=12)
    if save:
        import os  # local import: only the save path needs it
        # savefig does not create missing directories
        if not os.path.isdir('images'):
            os.makedirs('images')
        plt.savefig('images/' + title.replace('/','-').replace(' ','_') + ".png")
    if show:
        plt.show()
    # Release the figure so plotting many charts in a loop does not pile up memory
    plt.close()
    
def draw_all_company_rating(rating_key, companies_list, title_end):
    """Save one rating histogram per company without displaying it.

    rating_key     -- rating field to plot, e.g. 'rating'
    companies_list -- company names to iterate over
    title_end      -- suffix appended to the company name to form the title

    Reads the notebook-level `ratings` list.
    """
    for name in companies_list:
        draw_scores(
            get_company_ratings(name, ratings, rating_key),
            name + title_end,
            False,  # show
            True,   # save
        )

Plot 1 company's ratings

There are several rating categories: 'Career Opportunities', 'Compensation and Benefits', 'Culture and Values', 'Management', 'Work and Life Balance' and 'rating'.


In [ ]:
%matplotlib inline

company = companies[15]
rating_key = 'rating'

title = company + " overall ratings"
scores = get_company_ratings(company, ratings, rating_key)
draw_scores(scores, title)

rating_keys = ['Career Opportunities', 'Compensation and Benefits',
               'Culture and Values', 'Management', 'Work and Life Balance']
for rating_key in rating_keys:
    title = company + " " + rating_key + " ratings"
    scores = get_company_ratings(company, ratings, rating_key)
    draw_scores(scores, title)

Plot all companies' ratings

Plot all of the companies' ratings and save them as images.


In [ ]:
# Write an overall-rating histogram image for every company
rating_key = 'rating'
title_end = ' overall ratings'
draw_all_company_rating(
    rating_key=rating_key,
    companies_list=companies,
    title_end=title_end,
)

Overall Ratings


In [ ]:
import numpy as np

def get_all_ratings(rating_key, average = False):
    """Sum the five-bucket star histograms of every company in `companies`.

    rating_key -- rating field to aggregate, e.g. 'rating'
    average    -- when True, each company's histogram is first divided by its
                  own total count, so every company contributes a distribution
                  summing to 1 regardless of how many ratings it has

    Returns a length-5 float array. Reads the notebook-level `companies` and
    `ratings`.
    """
    total_ratings = np.zeros(5)
    for company in companies:
        company_ratings = get_company_ratings(company, ratings, rating_key)
        hist = np.histogram(company_ratings,bins=5,range=[0.01,5.01])[0]
        if average:
            counted = hist.sum()
            if counted == 0:
                # No ratings for this key: 0/0 would yield NaN and turn the
                # whole running total into NaN, so skip the company
                continue
            hist = hist.astype(float) / counted
        total_ratings += hist
    return total_ratings

In [ ]:
# Overall-rating histogram summed over every company.
# Bind the key to `rating_key`, the name passed on the next line, so this
# cell does not depend on whatever value an earlier cell left in it
rating_key = 'rating'
total_ratings = get_all_ratings(rating_key)
draw_hist(total_ratings, "All Overall Ratings", True, True)

In [ ]:
# One summed histogram per rating category, shown and saved
titles = []
for key in rating_keys:
    title = "All " + key + " Ratings"
    titles.append(title)
    total_ratings = get_all_ratings(key)
    draw_hist(total_ratings, title, True, True)

In [ ]:
# Now without weighting companies by how many ratings they have: each
# company's histogram is normalised before the histograms are summed.
# Bind the key to `rating_key`, the name passed on the next line
rating_key = 'rating'
total_ratings = get_all_ratings(rating_key, True)
draw_hist(total_ratings, "All Overall Ratings Average", True, True)

In [ ]:
# One normalised (per-company averaged) histogram per rating category,
# shown and saved
titles = []
for key in rating_keys:
    title = "All " + key + " Ratings Average"
    titles.append(title)
    total_ratings = get_all_ratings(key, True)
    draw_hist(total_ratings, title, True, True)