In [95]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import json
import csv
import os
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from statsmodels.sandbox.regression.predstd import wls_prediction_std
Below is a helper function that we'll use later to plot the largest coefficients of a linear SVM classifier.
In [3]:
def plot_coefficients(classifier, feature_names, top_features=20):
    #For a multiclass classifier, coef_ has one row per class;
    #ravel()[0:200] grabs the 200 coefficients belonging to the first class
    coef = classifier.coef_.ravel()[0:200]
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    #Create plot: negative coefficients in red, positive in blue
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    #Tick positions must match the bar positions above
    plt.xticks(np.arange(2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()
In [ ]:
def bayesian_average():
    pass #left unfinished here; see the sketch below
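A standard way to fill this in is the shrinkage form of the Bayesian average, which pulls an item's mean rating toward a prior mean m with prior weight C. The sketch below is one assumed implementation; the parameters m and C and the example values are illustrative, not from the original notebook.
In [ ]:
def bayesian_average(ratings, m, C):
    #m is the prior mean (e.g. the global mean rating across all businesses)
    #C is the prior weight, i.e. how many 'virtual' ratings the prior counts for
    return (C * m + sum(ratings)) / float(C + len(ratings))

#Example: two 5-star reviews shrunk toward an assumed global mean of 3.7
print bayesian_average([5, 5], m=3.7, C=10)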
In [9]:
#This is the main folder where all the modules and JSON files are stored on my computer.
#You need to change this to the folder path specific to your computer
file_directory = "/Users/robertsonwang/Desktop/Python/Yelp_class/yelp-classification/"
reviews_file = "cleaned_reviews_states_2010.json"
biz_file = "cleaned_business_data.json"
In [10]:
#This is a smaller subset of our overall Yelp data
#I randomly chose 5000 reviews from each state and filed them into the JSON file
#Note that for the overall dataset, we have about 2 million reviews.
#That's why we need to use a data management system like MongoDB in order to hold all our data
#and to more efficiently manipulate it
reviews_json = json.load(open(file_directory+reviews_file))
biz_json = json.load(open(file_directory+biz_file))
In [11]:
for key in reviews_json.keys():
    reviews_json[key] = reviews_json[key][0:5000]
In [4]:
#Let's see how reviews_json is set up
print reviews_json.keys()
In [29]:
#We can see that on the highest level, the dictionary keys are the different states
#Let's look at the 'useful' attribute of the first entry under Ohio
print reviews_json['OH'][0]['useful']
In [6]:
#So for each review filed under Ohio, we have many different attributes to choose from
#Let's look at what the review and rating was for the first review filed under Ohio
print reviews_json['OH'][0]['text']
print reviews_json['OH'][0]['stars']
In [41]:
#We want to split up reviews between text and labels for each state
reviews = []
stars = []
for key in reviews_json.keys():
    for review in reviews_json[key]:
        reviews.append(review['text'])
        stars.append(review['stars'])
#Just for demonstration, let's pick out the same review example as above but from our respective lists
print reviews[0]
print stars[0]
Let's take a look at the following regression (the intuition being that a review's information content is correlated with its length):
$Rating = \beta_{0} + \beta_{neg}\,neg + \beta_{pos}\,pos + \beta_{num}\,\text{Std\_NumWords} + \epsilon$
Where:
$neg = \frac{\text{Number of Negative Words}}{\text{Total Number of Words}}$
$pos = \frac{\text{Number of Positive Words}}{\text{Total Number of Words}}$
In [86]:
harvard_dict = pd.read_csv('HIV-4.csv')
negative_words = list(harvard_dict.loc[harvard_dict['Negativ'] == 'Negativ']['Entry'])
positive_words = list(harvard_dict.loc[harvard_dict['Positiv'] == 'Positiv']['Entry'])
In [25]:
#Use word dictionary from Hu and Liu (2004)
negative_words = open('negative-words.txt', 'r').read()
negative_words = negative_words.split('\n')
positive_words = open('positive-words.txt', 'r').read()
positive_words = positive_words.split('\n')
total_words = negative_words + positive_words
total_words = list(set(total_words))
In [53]:
review_length = []
negative_percent = []
positive_percent = []
for review in reviews:
    length_words = len(review.split())
    #Lowercase each word before checking membership, since the sentiment word lists are lowercase
    neg_words = [x.lower() for x in review.split() if x.lower() in negative_words]
    pos_words = [x.lower() for x in review.split() if x.lower() in positive_words]
    negative_percent.append(float(len(neg_words))/float(length_words))
    positive_percent.append(float(len(pos_words))/float(length_words))
    review_length.append(length_words)
In [140]:
regression_df = pd.DataFrame({'stars':stars, 'review_length':review_length, 'neg_percent': negative_percent, 'positive_percent': positive_percent})
In [156]:
#Standardize the explanatory variables
#(these are the regressors; the dependent variable is stars)
std_vars = ['neg_percent', 'positive_percent', 'review_length']
for var in std_vars:
    len_std = regression_df[var].std()
    len_mu = regression_df[var].mean()
    regression_df[var] = [(x - len_mu)/len_std for x in regression_df[var]]
Let's try using the Harvard-IV sentiment categories as explanatory variables
In [163]:
#The R-squared using the Harvard dictionary is about 0.1; compare this with the fit obtained from the Hu & Liu word dictionary
X = np.column_stack((regression_df.review_length,regression_df.neg_percent, regression_df.positive_percent))
y = regression_df.stars
X = sm.add_constant(X)
est = sm.OLS(y, X)
est2 = est.fit()
print(est2.summary())
NOTE: a BLUE estimator does not require normality of errors. The Gauss-Markov theorem states that the ordinary least squares estimate is the best linear unbiased estimator (BLUE) of the regression coefficients ('best' meaning minimum variance among all linear unbiased estimators) as long as the errors:
(1) have mean zero
(2) are uncorrelated
(3) have constant variance
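One quick way to eyeball condition (3) is to plot the residuals against the fitted values; a funnel shape would suggest non-constant variance. Here is a minimal sketch using the est2 fit from the previous cell:
In [ ]:
#Residuals vs. fitted values for the OLS fit est2 defined above
plt.scatter(est2.fittedvalues, est2.resid, alpha=0.1)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()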
In [164]:
x = np.array(regression_df.stars)
#beta = [3.3648, -0.3227 , 0.5033]
y = [int(round(i)) for i in list(est2.fittedvalues)]
y = np.array(y)
errors = np.subtract(x,y)
np.sum(errors)
In [ ]:
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(x, x, 'b', label="data")
ax.plot(x, y, 'o', label="ols")
#ax.plot(x, est2.fittedvalues, 'r--.', label="OLS")
#ax.plot(x, iv_u, 'r--')
#ax.plot(x, iv_l, 'r--')
ax.legend(loc='best');
In [91]:
#Do a QQ plot of the data
fig = sm.qqplot(errors)
plt.show()
In [12]:
star_hist = pd.DataFrame({'Ratings':stars})
star_hist.plot.hist()
In [13]:
df_list = []
states = list(reviews_json.keys())
for state in states:
    stars_state = []
    for review in reviews_json[state]:
        stars_state.append(review['stars'])
    star_hist = pd.DataFrame({'Ratings':stars_state})
    df_list.append(star_hist)
for i in range(0, len(df_list)):
    print states[i] + " Rating Distribution"
    df_list[i].plot.hist()
    plt.show()
Note that the support vector machine algorithm relies on drawing a separating hyperplane between the different classes, and such a hyperplane is not guaranteed to exist. For a complete set of conditions under which this is an appropriate algorithm to use, please see:
http://www.unc.edu/~normanp/890part4.pdf
The following is also a good, and more general, introduction to Support Vector Machines:
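As a toy illustration of the separating-hyperplane idea, here is a minimal sketch that fits a LinearSVC on four made-up, linearly separable 2-D points and prints the hyperplane it finds:
In [ ]:
#Four linearly separable points in 2-D, two per class (invented data)
toy_X = np.array([[0.0, 0.0], [0.0, 1.0], [3.0, 3.0], [3.0, 4.0]])
toy_y = np.array([0, 0, 1, 1])
toy_svm = LinearSVC().fit(toy_X, toy_y)
#The separating hyperplane is the set of points x with w.x + b = 0
print "w =", toy_svm.coef_, "b =", toy_svm.intercept_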
In [36]:
#First let's separate out our dataset into a training sample and a test sample
#We specify a training sample percentage of 80% of our total dataset. This is just a rule of thumb
#Note that the reviews list was built state by state, so ideally we would shuffle before splitting
training_percent = 0.8
train_reviews = reviews[0:int(len(reviews)*training_percent)]
test_reviews = reviews[int(len(reviews)*training_percent):len(reviews)]
train_ratings = stars[0:int(len(stars)*training_percent)]
test_ratings = stars[int(len(stars)*training_percent):len(stars)]
In order to use the machine learning algorithms in scikit-learn, we first have to initialize a CountVectorizer object. This object creates a matrix representation of our corpus, with one row per document and one column per vocabulary word. There are many options that we can specify when we initialize our CountVectorizer object (see the documentation for the full list), but they essentially all relate to how the words are represented in the final matrix.
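To make that concrete, here is a minimal sketch on two made-up sentences; each row counts the vocabulary words in one document:
In [ ]:
toy_vectorizer = CountVectorizer()
toy_counts = toy_vectorizer.fit_transform(["the food was great", "the service was bad"])
print toy_vectorizer.get_feature_names()
print toy_counts.toarray()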
In [37]:
vectorizer = CountVectorizer(analyzer = "word", \
                             tokenizer = None, \
                             preprocessor = None, \
                             stop_words = None, \
                             vocabulary = total_words, \
                             max_features = 200)
#Note: max_features is ignored when an explicit vocabulary is supplied
train_data_features = vectorizer.fit_transform(train_reviews)
#Use transform (not fit_transform) on the test set so it is encoded with the same vocabulary
test_data_features = vectorizer.transform(test_reviews)
In [38]:
output = pd.DataFrame( data={"Reviews": test_reviews, "Rating": test_ratings} )
Let's create a linear SVM instance from scikit-learn and have it train on our subset of reviews. We'll write the results to an output dataframe and then calculate a total accuracy percentage.
In [26]:
#Let's do the same exercise as above but use TF-IDF, you can learn more about TF-IDF here:
#https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html
tf_transformer = TfidfTransformer(use_idf=True)
train_data_features = tf_transformer.fit_transform(train_data_features)
#Again, use transform (not fit_transform) so the test set gets the weights learned from training
test_data_features = tf_transformer.transform(test_data_features)
lin_svm = LinearSVC()
lin_svm = lin_svm.fit(train_data_features, train_ratings)
lin_svm_result = lin_svm.predict(test_data_features)
output['lin_svm'] = lin_svm_result
output['Accurate'] = np.where(output['Rating'] == output['lin_svm'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
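To see what the transformer is doing, here is a minimal sketch of TF-IDF on a made-up count matrix; the rarer second word gets up-weighted relative to the common first word:
In [ ]:
#Rows are documents, columns are raw word counts (invented numbers)
toy_counts = np.array([[3, 0], [2, 1], [3, 0]])
print TfidfTransformer(use_idf=True).fit_transform(toy_counts).toarray()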
In [51]:
#Here we plot the features with the highest absolute value coefficient weight
plot_coefficients(lin_svm, vectorizer.get_feature_names())
scikit-learn also provides what's known as a pipeline. Instead of having to declare each of these objects on its own and pass them into one another, we can create a single object with all the necessary options specified and then use that to run the algorithm. For each pipeline below, we specify the vectorizer to be the CountVectorizer object we defined above, set it to use TF-IDF, and then specify the classifier that we want to use.
Below, we create a separate pipeline for a Bagged Decision Tree, Multinomial Naive Bayes, and a Random Forest. We then append the results to the dataframe that we've already created.
In [39]:
bagged_dt = Pipeline([('vect', vectorizer),
                      ('tfidf', TfidfTransformer()),
                      ('clf', BaggingClassifier())])
bagged_dt.set_params(clf__n_estimators=100, clf__n_jobs=1).fit(train_reviews, train_ratings)
output['bagged_dt'] = bagged_dt.predict(test_reviews)
output['Accurate'] = np.where(output['Rating'] == output['bagged_dt'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
multi_logit = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
multi_logit.set_params(clf__alpha=1, clf__fit_prior = True, clf__class_prior = None).fit(train_reviews, train_ratings)
output['multi_logit'] = multi_logit.predict(test_reviews)
output['Accurate'] = np.where(output['Rating'] == output['multi_logit'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
In [40]:
random_forest = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer()),
('clf', RandomForestClassifier())])
random_forest.set_params(clf__n_estimators=100, clf__criterion='entropy').fit(train_reviews, train_ratings)
output['random_forest'] = random_forest.predict(test_reviews)
output['Accurate'] = np.where(output['Rating'] == output['random_forest'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
As you can see, none of the above classifiers performs significantly better than a fair coin toss. This is most likely due to the heavily skewed distribution of review ratings: most reviews receive 4 or 5 stars, so the language associated with those two ratings is easily confused. We can confirm this by looking at the "confusion matrix" of our predictions.
In [70]:
print metrics.confusion_matrix(test_ratings, bagged_dt.predict(test_reviews), labels = [1, 2, 3, 4, 5])
Each row and column corresponds to a rating. For example, element (1,1) is the number of 1-star reviews that were correctly classified, and element (1,2) is the number of 1-star reviews that were incorrectly classified as 2 stars. The sum of the diagonal is therefore the total number of correctly classified reviews. As you can see, the bagged decision tree classifier labels many four-star reviews as five-star reviews and vice versa.
This indicates that we can improve our results by using more aggregated categories, for example labeling all four- and five-star reviews as "good" and all other ratings as "bad", as sketched below.
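As a minimal sketch of that aggregation (the four-star threshold follows the suggestion above), the cell below collapses the ratings into two classes and retrains the multinomial Naive Bayes pipeline:
In [ ]:
#Collapse ratings into two classes: 'good' (4-5 stars) vs. 'bad' (1-3 stars)
train_binary = ['good' if r >= 4 else 'bad' for r in train_ratings]
test_binary = ['good' if r >= 4 else 'bad' for r in test_ratings]
binary_clf = Pipeline([('vect', vectorizer),
                       ('tfidf', TfidfTransformer()),
                       ('clf', MultinomialNB())])
binary_clf.fit(train_reviews, train_binary)
print np.mean(binary_clf.predict(test_reviews) == np.array(test_binary))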
In [12]:
for review in reviews_json[reviews_json.keys()[0]]:
    print type(review['date'])
    break
In [16]:
latitude_list = []
longitude_list = []
stars_list = []
count_list = []
state_list = []
for biz in biz_json:
    stars_list.append(biz['stars'])
    latitude_list.append(biz['latitude'])
    longitude_list.append(biz['longitude'])
    count_list.append(biz['review_count'])
    state_list.append(biz['state'])
In [17]:
biz_df = pd.DataFrame({'ratings':stars_list, 'latitude':latitude_list, 'longitude': longitude_list, 'review_count': count_list, 'state':state_list})
We draw a heat map for each state below. Longitude is on the x-axis and latitude is on the y-axis. The color coding is as follows: red for ratings in [1, 2), orange for [2, 3), yellow for [3, 4), green for [4, 5), and blue for ratings of 5.
In [109]:
states = [u'OH', u'NC', u'WI', u'IL', u'AZ', u'NV']
cmap, norm = mpl.colors.from_levels_and_colors([1, 2, 3, 4, 5], ['red', 'orange', 'yellow', 'green', 'blue'], extend = 'max')
for state in states:
    state_df = biz_df[biz_df.state == state]
    state_df_filt = state_df[(np.abs(state_df.longitude-state_df.longitude.mean()) <= 2*state_df.longitude.std()) \
                             & (np.abs(state_df.latitude-state_df.latitude.mean()) <= 2*state_df.latitude.std())]
    plt.ylim(min(state_df_filt.latitude), max(state_df_filt.latitude))
    plt.xlim(min(state_df_filt.longitude), max(state_df_filt.longitude))
    plt.scatter(state_df_filt.longitude, state_df_filt.latitude, c=state_df_filt.ratings, cmap=cmap, norm=norm)
    plt.show()
    print state
We run the following linear regression model for each of the states:
$Rating = \beta_{1}\,\text{Longitude} + \beta_{2}\,\text{Latitude} + \beta_{3}\,\text{Num of Reviews} + \epsilon$
In [126]:
for state in states:
    state_df = biz_df[biz_df.state == state]
    #Take a copy so that overwriting columns below doesn't trigger pandas chained-assignment warnings
    state_df_filt = state_df[(np.abs(state_df.longitude-state_df.longitude.mean()) <= 2*state_df.longitude.std()) \
                             & (np.abs(state_df.latitude-state_df.latitude.mean()) <= 2*state_df.latitude.std())].copy()
    state_df_filt['longitude'] = (state_df_filt.longitude - state_df.longitude.mean())/state_df.longitude.std()
    state_df_filt['latitude'] = (state_df_filt.latitude - state_df.latitude.mean())/state_df.latitude.std()
    state_df_filt['review_count'] = (state_df_filt.review_count - state_df.review_count.mean())/state_df.review_count.std()
    X = np.column_stack((state_df_filt.longitude, state_df_filt.latitude, state_df_filt.review_count))
    y = state_df_filt.ratings
    est = sm.OLS(y, X)
    est2 = est.fit()
    print(est2.summary())
    print state