Ratings

In this notebook we'll plot the company ratings.

1 Company
Overall

To see how the data was obtained, go to the Scraping notebook. For further analysis, go to the Topic Modeling notebook.



In [ ]:

    
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
from pymongo import MongoClient
# from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.corpus import stopwords
# import re
# import string
# from gensim import corpora, models, similarities

import reviews_data
import utils



In [ ]:

    
# DB settings
# MongoClient() is called with no arguments, so the driver's default
# connection settings are used.
client = MongoClient()

# Handles on the "indeed" database and its "jobs" / "reviews" collections.
indeed_db = client['indeed']
indeed_jobs = indeed_db['jobs']
indeed_reviews = indeed_db['reviews']

# Handle on the "glassdoor" database and its "reviews" collection.
glassdoor_db = client['glassdoor']
glassdoor_reviews = glassdoor_db['reviews']



In [ ]:

    
# Unique company names, sorted so that positional lookups further down
# (e.g. companies[15]) select the same company on every run; the iteration
# order of a set is not guaranteed.
companies = sorted(set(utils.get_company_names(indeed_reviews)))

# Raw rating records gathered from both review collections.
ratings_raw = reviews_data.all_ratings(indeed_reviews, glassdoor_reviews)

# Drop ratings without any rating information: a record with a single field
# carries no score (presumably that lone field is 'company' -- TODO confirm
# against reviews_data.all_ratings).
ratings = [rating for rating in ratings_raw if len(rating) > 1]
len(ratings)



In [ ]:

    
# Display the first kept rating record to see which keys it carries.
ratings[0]



In [ ]:

    
def get_company_ratings(company, ratings, key):
    """Collect one company's scores for a single rating category.

    Company names are compared case-insensitively. Records that do not
    contain ``key`` are skipped; matching values are converted to float.

    Args:
        company: company name to look up.
        ratings: iterable of dict-like rating records, each with a
            'company' entry.
        key: rating category to extract. Keys listed by the original author:
            'Comp & Benefits', 'Culture & Values', 'Career Opportunities',
            'Senior Management', 'Work/Life Balance', and 'rating'.
            NOTE(review): the plotting cells in this notebook pass
            differently spelled keys (e.g. 'Compensation and Benefits') --
            confirm which spellings the records actually use.

    Returns:
        List of floats, in the order the records appear in ``ratings``.
    """
    return [
        float(entry[key])
        for entry in ratings
        if entry['company'].lower() == company.lower() and key in entry
    ]

# Plot ratings histograms
def draw_scores(scores, title="", show = True, save = False):
    """Bin star scores into five buckets and plot them with draw_hist.

    Args:
        scores: sequence of numeric star scores.
        title: figure title, forwarded to draw_hist.
        show: forwarded to draw_hist.
        save: forwarded to draw_hist.

    Returns:
        None. When ``scores`` is empty a message is printed and nothing is
        plotted.
    """
    if len(scores) == 0:
        # Call form of print works on both Python 2 and Python 3.
        print("No scores to plot")
        return
    # Bin edges are shifted by 0.01 so that a whole-star score k lands in
    # bucket k (1.0 -> first bucket, ..., 5.0 -> last bucket); scores outside
    # [0.01, 5.01] are ignored by np.histogram.
    hist = np.histogram(scores, bins=5, range=[0.01, 5.01])[0]
    draw_hist(hist, title, show, save)

def draw_hist(hist, title = "", show = True, save = False):
    """Draw a bar chart of per-star counts.

    Args:
        hist: sequence of bar heights; element i is drawn at star value i+1.
        title: figure title; also used to build the file name when saving
            ('/' becomes '-', ' ' becomes '_', '.png' is appended).
        show: if True, display the figure with plt.show().
        save: if True, write the figure into the 'images' directory
            (relative to the current working directory), creating the
            directory if it does not exist yet.

    Returns:
        None.
    """
    n_bins = len(hist)
    fig = plt.figure(figsize=(8,6))
    # Centre each bar on its star value explicitly instead of relying on the
    # default of plt.bar's `align`, which differs between matplotlib versions.
    plt.bar(list(range(1, n_bins + 1)), hist, width=0.8, align='center')
    plt.title(title,fontsize=25)
    plt.xlabel("Stars",fontsize=16)
    plt.ylabel("Number of reviews",fontsize=16)
    plt.xticks(list(range(n_bins + 1)))
    plt.tick_params(labelsize=12)
    if save:
        out_dir = 'images'
        # savefig does not create missing directories.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        file_name = title.replace('/','-').replace(' ','_') + ".png"
        plt.savefig(os.path.join(out_dir, file_name))
    if show:
        plt.show()
    # Close this figure so figures do not pile up when plotting in a loop.
    plt.close(fig)
    
def draw_all_company_rating(rating_key, companies_list, title_end):
    """Save one histogram per company for the given rating category.

    Reads the notebook-level ``ratings`` list. Each figure is titled
    ``company + title_end`` and is drawn with show=False, save=True, so the
    figures are written to disk without being displayed.

    Args:
        rating_key: rating category passed to get_company_ratings.
        companies_list: iterable of company names.
        title_end: suffix appended to the company name to form the title.
    """
    for name in companies_list:
        draw_scores(
            get_company_ratings(name, ratings, rating_key),
            name + title_end,
            show=False,
            save=True
        )

Plot 1 company's ratings

There are several rating categories: 'Career Opportunities', 'Compensation and Benefits', 'Culture and Values', 'Management', 'Work and Life Balance' and 'rating'.



In [ ]:

    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): the company is picked by position, so which one is plotted
# depends on the order of `companies`; this raises IndexError if the list has
# fewer than 16 entries.
company = companies[15]
rating_key = 'rating'

# Overall star rating for the chosen company.
title = company + " overall ratings"
scores = get_company_ratings(company, ratings, rating_key)
draw_scores(scores, title)

# One histogram per rating category. `rating_keys` is reused by later cells.
# The loop rebinds `rating_key`, so after this cell it holds the last category.
rating_keys = ['Career Opportunities', 'Compensation and Benefits',
               'Culture and Values', 'Management', 'Work and Life Balance']
for rating_key in rating_keys:
    title = company + " " + rating_key + " ratings"
    scores = get_company_ratings(company, ratings, rating_key)
    draw_scores(scores, title)

Plot all companies' ratings

Plot all of the companies' ratings and save them as images.



In [ ]:

    
# Draw the overall-rating histogram of every company in `companies`;
# each figure is titled "<company> overall ratings".
rating_key = 'rating'
title_end = ' overall ratings'
draw_all_company_rating(rating_key, companies, title_end)

Overall Ratings



In [ ]:

    
import numpy as np

def get_all_ratings(rating_key, average = False):
    """Add up the five-bucket star histograms of every company.

    Reads the notebook-level ``companies`` and ``ratings``.

    Args:
        rating_key: rating category passed to get_company_ratings.
        average: if True, each company's histogram is divided by its own
            number of counted ratings before being added, so every company
            contributes equally however many ratings it has.

    Returns:
        numpy array of length 5 with the summed histogram.
    """
    total_ratings = np.zeros(5)
    for company in companies:
        company_ratings = get_company_ratings(company, ratings, rating_key)
        # Same shifted bin edges as draw_scores: whole-star score k falls in
        # bucket k; scores outside [0.01, 5.01] are not counted.
        hist = np.histogram(company_ratings,bins=5,range=[0.01,5.01])[0]
        if average:
            counted = hist.sum()
            if counted == 0:
                # Nothing to normalize; dividing would give 0/0 = NaN and
                # turn the whole running total into NaN.
                continue
            hist = hist.astype(float) / counted
        total_ratings += hist
    return total_ratings



In [ ]:

    
# Summed overall-rating histogram across all companies (raw counts).
# The key is bound to `rating_key`, the name actually passed below, so this
# cell no longer depends on a value left behind by an earlier cell.
rating_key = 'rating'
total_ratings = get_all_ratings(rating_key)
draw_hist(total_ratings, "All Overall Ratings", True, True)



In [ ]:

    
# One summed histogram per rating category, shown and saved.
titles = ["All %s Ratings" % r for r in rating_keys]

for key, title in zip(rating_keys, titles):
    total_ratings = get_all_ratings(key)
    draw_hist(total_ratings, title, show=True, save=True)



In [ ]:

    
# Now with each company's histogram normalized by its own number of ratings
# (average=True), so companies with many reviews do not dominate the total.
# The key is bound to `rating_key`, the name actually passed below.
rating_key = 'rating'
total_ratings = get_all_ratings(rating_key, True)
draw_hist(total_ratings, "All Overall Ratings Average", True, True)



In [ ]:

    
# Per-category histograms again, this time with per-company normalization
# (second argument of get_all_ratings set to True); shown and saved.
titles = ["All %s Ratings Average" % r for r in rating_keys]

for key, title in zip(rating_keys, titles):
    total_ratings = get_all_ratings(key, True)
    draw_hist(total_ratings, title, show=True, save=True)