Created by Judit Acs
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping
In [2]:
df = pd.read_csv("data/stackoverflow/survey_results_public.csv")
df.head()
Out[2]:
Most answers are categorical:
In [3]:
df.groupby("ProgramHobby").size()
Out[3]:
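value_counts gives the same breakdown, sorted by frequency; a minimal sketch:
In [ ]:
# Alternative sketch: the same breakdown, sorted by frequency
df["ProgramHobby"].value_counts()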
The table has 154 columns, but many values are missing. These are the 20 columns with the most non-missing values:
In [4]:
df.count().sort_values(ascending=False)[:20]
Out[4]:
In [5]:
feature_cols = ["Professional", "EmploymentStatus", "FormalEducation", "ProgramHobby", "HomeRemote",
"IDE", "MajorUndergrad"]
# I do not include JobSatisfaction because it's too similar to the target variable
# Uncomment this line to include it in the features
# feature_cols.append("JobSatisfaction")
target_col = "CareerSatisfaction"
We drop every row that is missing the target column or any of the feature columns.
In [6]:
condition = df[target_col].notnull()
for c in feature_cols:
    condition &= df[c].notnull()
df = df[condition]
len(df)
Out[6]:
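The same filtering can be written as a single dropna call; a sketch:
In [ ]:
# Alternative sketch: drop every row with a missing value in any used column
df = df.dropna(subset=feature_cols + [target_col])
len(df)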
CareerSatisfaction values are distributed unevenly, so we may want to downsample the largest classes. Each class is capped at twice the size of the smallest class; uncomment the second-to-last line to apply the filtering:
In [7]:
# Cap the number of samples per class at twice the size of the smallest class
max_class_size = df.groupby(target_col).size().min() * 2
filt = None
for grouper, group in df.groupby(target_col):
    size = min(max_class_size, len(group))
    if filt is None:
        filt = group.sample(size)
    else:
        filt = pd.concat((filt, group.sample(size)), axis=0)
# df = filt
len(df)
Out[7]:
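The same capped sampling fits into a single groupby-apply; a sketch, assuming the max_class_size cap defined above:
In [ ]:
# Alternative sketch: capped sampling per class in one groupby-apply
filt = df.groupby(target_col, group_keys=False).apply(
    lambda g: g.sample(min(max_class_size, len(g))))
len(filt)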
In [8]:
# One-hot encode each categorical feature and concatenate the blocks
X = None
for col in feature_cols:
    # Map category labels to integer ids
    mtx = LabelEncoder().fit_transform(df[col])
    maxval = np.max(mtx)
    # Build the one-hot matrix for this column
    feat_mtx = np.zeros((mtx.shape[0], maxval + 1))
    feat_mtx[np.arange(feat_mtx.shape[0]), mtx] = 1
    if X is None:
        X = feat_mtx
    else:
        X = np.concatenate((X, feat_mtx), axis=1)
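pandas can build an equivalent one-hot matrix in a single call; a sketch (the column order may differ from the loop above):
In [ ]:
# Alternative sketch: one-hot encode all feature columns at once
X_alt = pd.get_dummies(df[feature_cols]).values
X_alt.shape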
In [9]:
# Scale the 0-10 satisfaction ratings to [0, 1] for the sigmoid output
y = df[target_col].values / 10
In [10]:
# Shuffle the row indices and split 90% train / 10% test
perm = np.random.permutation(X.shape[0])
train_split = int(X.shape[0] * 0.9)
train_indices = perm[:train_split]
test_indices = perm[train_split:]
X_train = X[train_indices]
X_test = X[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]
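scikit-learn offers the same shuffled split as a one-liner; a sketch with the same 90/10 ratio:
In [ ]:
# Alternative sketch: the same shuffled 90/10 split via scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)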
In [11]:
input_layer = Input(batch_shape=(None, X.shape[1]))
# Three hidden layers, each followed by dropout
layer = Dense(100, activation="sigmoid")(input_layer)
layer = Dropout(.2)(layer)
layer = Dense(100, activation="sigmoid")(layer)
layer = Dropout(.2)(layer)
layer = Dense(100, activation="sigmoid")(layer)
layer = Dropout(.2)(layer)
# Sigmoid output matches the [0, 1] scale of the target
layer = Dense(1, activation="sigmoid")(layer)
model = Model(inputs=input_layer, outputs=layer)
model.compile("rmsprop", loss="mse")
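Printing the model summary is a quick way to verify that the hidden layers are chained as intended:
In [ ]:
# Print layer shapes and parameter counts
model.summary()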
In [12]:
ea = EarlyStopping(patience=2)
model.fit(X_train, y_train, epochs=100, batch_size=512,
          validation_split=.2, callbacks=[ea])
Out[12]:
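fit returns a History object, so the learning curves can be plotted to see when early stopping triggered; a sketch (note that calling fit again continues training the already-fitted model):
In [ ]:
# Sketch: capture the History object and plot train vs. validation loss
history = model.fit(X_train, y_train, epochs=100, batch_size=512,
                    validation_split=.2, callbacks=[ea])
plt.plot(history.history["loss"], label="train loss")
plt.plot(history.history["val_loss"], label="validation loss")
plt.legend()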
In [13]:
pred = model.predict(X_test)
The root mean squared error (RMSE) of the predictions on the test set:
In [14]:
# RMSE between predictions and gold labels
np.sqrt(np.mean((pred[:, 0] - y_test) ** 2))
Out[14]:
Compare with a constant baseline that always predicts 0.5:
In [15]:
# RMSE of a constant 0.5 baseline
np.sqrt(np.mean((.5 * np.ones(y_test.shape[0]) - y_test) ** 2))
Out[15]:
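As a sanity check, the same RMSE can be computed with scikit-learn; a minimal sketch:
In [ ]:
# Sketch: RMSE via scikit-learn's mean_squared_error
from sklearn.metrics import mean_squared_error

np.sqrt(mean_squared_error(y_test, pred[:, 0]))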
Let's compare the distributions of the gold labels and the predictions:
In [16]:
prediction = pd.DataFrame({'gold': y_test, 'prediction': pred[:, 0]})
prediction['diff'] = prediction.gold - prediction.prediction
prediction.hist(['gold', 'prediction'], bins=11)
Out[16]:
In [17]:
prediction.sample(20).plot(y=['gold', 'prediction'], kind='bar')
Out[17]:
JobSatisfaction was excluded from the features because it is very similar to CareerSatisfaction; let's look at the two columns side by side:
In [18]:
df[['JobSatisfaction', 'CareerSatisfaction']].sample(20).plot(kind='bar')
Out[18]:
And the distribution of their difference:
In [19]:
(df['JobSatisfaction'] - df['CareerSatisfaction']).hist(bins=20)
Out[19]: