Predicting missing salaries and new email connections from a company's email network
Network Analysis. Supervised Learning. Binary classification for both tasks: management salary prediction and new connections prediction
Data from Applied Social Network Analysis in Python | Coursera:
net_emails.txt
: network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between the two people.
The network also contains the node attributes Department (name) and ManagementSalary (1 = receiving a management salary)
net_future_connections.csv
: future connections of pairs of currently unconnected nodes (1 = an edge between those two nodes will exist in the future)
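Both files live under data/. As a quick standalone preview (a sketch mirroring the loading cells below; the .txt file is actually a pickled graph):

import networkx as nx
import pandas as pd

g = nx.read_gpickle('data/net_emails.txt')             # pickled Graph despite the .txt extension
pairs = pd.read_csv('data/net_future_connections.csv',
                    index_col=0, converters={0: eval})  # index entries are "(u, v)" strings
print(g.number_of_nodes(), 'nodes,', g.number_of_edges(), 'edges,', len(pairs), 'candidate pairs')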
In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import helper
In [2]:
graph = nx.read_gpickle('data/net_emails.txt')
print(nx.info(graph))
In [3]:
graph.nodes(data=True)[:3]
Out[3]:
In [4]:
# Dataframe with node attributes
df = pd.DataFrame(index=graph.nodes()) # df: complete df
attributes = [k for k in graph.nodes(data=True)[0][1]]  # attribute names from the first node's attribute dict
for a in attributes:
df[a] = pd.Series(nx.get_node_attributes(graph, a))
# node features
df['clustering'] = pd.Series(nx.clustering(graph))
df['degree'] = pd.Series(graph.degree())
df.head()
Out[4]:
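As a sanity check on the two engineered node features, the same values can be recomputed for a single node (node 0 chosen arbitrarily, assuming it exists in the graph):

print(nx.clustering(graph, 0))   # local clustering coefficient of node 0
print(graph.degree(0))           # number of distinct email contacts of node 0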
In [5]:
target = ['ManagementSalary']
features = [col for col in df if col not in target]
print(df[target].squeeze().value_counts(dropna=False))
In [6]:
n_rows_original = df.shape[0]
df_pred = df[df['ManagementSalary'].isnull()]
df = df[(df['ManagementSalary'] == 0) | (df['ManagementSalary'] == 1)]
assert df.shape[0] + df_pred.shape[0] == n_rows_original  # no rows lost in the split
In [7]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(
df, test_size=0.2, stratify=df[target], random_state=0)
del df
df_pred
: prediction set (no labels)
df_train
: training set
df_test
: test set
In [8]:
import helper
cat = ['Department', 'ManagementSalary']
num = ['clustering', 'degree']
df_train = helper.classify_data(df_train, target, categorical=cat)
pd.DataFrame(dict(df_train.dtypes), index=["Type"])[df_train.columns].head()
Out[8]:
In [9]:
# df_train, dict_categories = helper.remove_categories(df_train, target, ratio=0.01, show=True,
# dict_categories=None)
In [10]:
df_train[num].describe(percentiles=[0.5])
Out[10]:
In [11]:
helper.show_numerical(df_train[num], kde=True)
In [12]:
helper.show_target_vs_numerical(df_train, target, jitter=0.2, fit_reg=False, point_size=100)
In [13]:
helper.show_correlation(df_train, target)
In [14]:
df_train[cat].describe()
Out[14]:
In [15]:
helper.show_categorical(df_train[cat], target, sharey=True)
In [16]:
helper.show_target_vs_categorical(df_train, target)
In [17]:
high_missing = helper.missing(df_train, limit=0.4)
#helper.fill_simple(df_train, target, missing_categorical=999, inplace=True)
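helper.missing is a project-local utility; presumably it reports the columns whose missing-value ratio exceeds limit. A minimal sketch of that idea:

ratios = df_train.isnull().mean()   # fraction of missing values per column
print(ratios[ratios > 0.4])         # columns above the 40% threshold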
In [18]:
copy_df = df_train.copy() # checkpoint
del df_train
In [19]:
df_train = copy_df.copy() # Restore checkpoint
data = df_train.copy()
# from now on use data instead of df
In [20]:
data, scale_param = helper.scale(data)
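helper.scale is also project-local; returning scale_param suggests the scaler is fit on the training data and reapplied to the test set later (cell In [27]). A minimal sketch, assuming z-score scaling of the numeric columns (a guess at its behavior, not the actual implementation):

def scale_sketch(df, num_cols, params=None):
    # fit on training data (params=None); reuse the saved parameters on test data
    if params is None:
        params = {c: (df[c].mean(), df[c].std()) for c in num_cols}
    out = df.copy()
    for c, (mu, sigma) in params.items():
        out[c] = (out[c] - mu) / sigma
    return out, params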
In [21]:
# features only; target encoded later
data, dict_dummies = helper.replace_by_dummies(data, target)
# save features order for tests and predictions
model_features = [f for f in data if f not in target]
data.head(3)
Out[21]:
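replace_by_dummies presumably wraps pd.get_dummies, with dict_dummies recording the dummy columns so that the test set can later be aligned to the same layout (cell In [27]). A hedged minimal equivalent (function name and details are illustrative):

def dummies_sketch(df, columns=None):
    # one-hot encode the categorical feature; align to a known column layout if given
    enc = pd.get_dummies(df, columns=['Department'])
    if columns is not None:
        enc = enc.reindex(columns=columns, fill_value=0)  # unseen dummies become all-zero
    return enc, list(enc.columns)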
In [22]:
def validation_split(data, val_size=0.15):
train, val = train_test_split(
data, test_size=val_size, random_state=0, shuffle=True, stratify=data[target])
# Separate the data into features and target (x=features, y=target)
x_train, y_train = train.drop(target, axis=1).values, train[target].values
x_val, y_val = val.drop(target, axis=1).values, val[target].values
return x_train, y_train, x_val, y_val
x_train, y_train, x_val, y_val = validation_split(data, val_size=0.2)
In [23]:
import keras
def one_hot_output(y_train, y_val):
num_classes = len(np.unique(y_train))
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
return y_train, y_val
y_train, y_val = one_hot_output(y_train, y_val)
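keras.utils.to_categorical simply maps integer class labels to one-hot rows, e.g.:

keras.utils.to_categorical([0, 1, 1], 2)
# array([[1., 0.],
#        [0., 1.],
#        [0., 1.]])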
In [24]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("val size \t X:{} \t Y:{}".format(x_val.shape, y_val.shape))
In [25]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras import regularizers
def build_nn_binary_classification(input_size, output_size, summary=False):
input_nodes = input_size // 8
model = Sequential()
model.add(
Dense(input_nodes, input_dim=input_size, kernel_regularizer=regularizers.l2(0.001)))
model.add(
Dense(output_size, activation='softmax', kernel_regularizer=regularizers.l2(0.001)))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
if summary:
model.summary()
return model
build_nn = build_nn_binary_classification
In [26]:
import os
from time import time
def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
"""
Train the neural network model. validation_data, if provided, is used to
monitor val_loss for early stopping.
"""
if show:
print('Training ....')
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=0)]
t0 = time()
history = model.fit(
x_train,
y_train,
epochs=200,
batch_size=64,
verbose=0,
validation_data=validation_data,
# class_weight=cw, # worse results
callbacks=callbacks)
if show:
print("time: \t {:.1f} s".format(time() - t0))
helper.show_training(history)
if path:
model.save(path)
print("\nModel saved at", path)
return history
model = None
model = build_nn_binary_classification(x_train.shape[1], y_train.shape[1], summary=True)
train_nn(model, x_train, y_train, validation_data=(x_val, y_val));
In [27]:
data_test = helper.classify_data(df_test, target, categorical=cat)
data_test, _ = helper.scale(data_test, scale_param)
data_test, _ = helper.replace_by_dummies(data_test, target, dict_dummies)
data_test = data_test[model_features+target] # sort columns to match training features order
x_test, y_test = data_test.drop(target, axis=1).values, data_test[target].values
y_test = keras.utils.to_categorical(y_test, 2)
In [28]:
from sklearn.metrics import roc_auc_score
score = model.evaluate(x_test, y_test, verbose=0)
print("\nNeural Network Accuracy: {:.3f}\n".format(score[1]))
y_pred = model.predict(x_test)
print('Neural Network ROC AUC: {:.3f} \n'.format(roc_auc_score(y_test, y_pred)))
In [29]:
y_train = y_train[:, 1]  # collapse one-hot targets back to 0/1 labels (positive-class column) for sklearn
y_test = y_test[:, 1]
In [30]:
# from sklearn.utils import class_weight
# y_plain = np.ravel(y_train)
# cw = class_weight.compute_class_weight('balanced', np.unique(y_plain), y_plain)
# cw = {idx : value for idx, value in enumerate(cw)}
In [31]:
helper.ml_classification(x_train, y_train, x_test, y_test, cross_validation=False)
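helper.ml_classification is project-local; presumably it benchmarks standard scikit-learn classifiers on the same split. A minimal sketch of that idea (model choice is illustrative):

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

for clf in (LogisticRegression(), RandomForestClassifier()):
    clf.fit(x_train, np.ravel(y_train))
    proba = clf.predict_proba(x_test)[:, 1]   # probability of the positive class
    print(type(clf).__name__, 'ROC AUC: {:.3f}'.format(roc_auc_score(y_test, proba)))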
In [32]:
del df_train, df_test, df_pred
In [33]:
df = pd.read_csv('data/net_future_connections.csv', index_col=0, converters={0: eval})
df.head(6)
Out[33]:
In [34]:
df['Common Neighbors'] = df.index.map(lambda pair: len(list(nx.common_neighbors(graph, pair[0], pair[1]))))
df['Jaccard Coefficient'] = [i[2] for i in nx.jaccard_coefficient(graph, df.index)]
df['Resource Allocation'] = [i[2] for i in nx.resource_allocation_index(graph, df.index)]
df['Adamic-Adar Index'] = [i[2] for i in nx.adamic_adar_index(graph, df.index)]
df['Preferential Attachment'] = [i[2] for i in nx.preferential_attachment(graph, df.index)]
df.head()
Out[34]:
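For intuition, the Jaccard coefficient of a pair (u, v) is |Γ(u) ∩ Γ(v)| / |Γ(u) ∪ Γ(v)|, where Γ(x) is the neighbor set of x. A quick hand-check against the networkx value for the first candidate pair:

u, v = df.index[0]                      # arbitrary pair of currently unconnected nodes
nu, nv = set(graph[u]), set(graph[v])   # neighbor sets
print(len(nu & nv) / len(nu | nv), df['Jaccard Coefficient'].iloc[0])   # should match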
In [35]:
target = ['Future Connection']
features = [col for col in df if col not in target]
df['Future Connection'].value_counts(dropna=False)
Out[35]:
In [36]:
n_rows_original = df.shape[0]
df_pred = df[df['Future Connection'].isnull()]
df = df[(df['Future Connection']==0) | (df['Future Connection']==1)]
assert df.shape[0] + df_pred.shape[0] == n_rows_original  # no rows lost in the split
In [37]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(
df, test_size=0.2, stratify=df[target], random_state=0)
del df
df_pred
: prediction set (no labels)
df_train
: training set
df_test
: test set
In [38]:
import helper
cat = ['Future Connection']
num = features # all the features are numerical here
df_train = helper.classify_data(df_train, target, categorical=cat)
pd.DataFrame(dict(df_train.dtypes), index=["Type"])[df_train.columns].head()
Out[38]:
In [39]:
df_train[num].describe(percentiles=[0.5])
Out[39]:
In [40]:
helper.show_numerical(df_train, kde=True)
In [41]:
helper.show_target_vs_numerical(df_train, target, jitter=0.2, fit_reg=False, point_size=10)
In [42]:
helper.show_correlation(df_train, target)
In [43]:
high_missing = helper.missing(df_train, limit=0.4)
In [44]:
copy_df = df_train.copy() # checkpoint
del df_train
In [45]:
df_train = copy_df.copy() # Restore checkpoint
data = df_train.copy()
# from now on use data instead of df
In [46]:
data, scale_param = helper.scale(data)
model_features = [f for f in data if f not in target]
In [47]:
def validation_split(data, val_size=0.15):
train, val = train_test_split(
data, test_size=val_size, random_state=0, shuffle=True, stratify=data[target])
# Separate the data into features and target (x=features, y=target)
x_train, y_train = train.drop(target, axis=1).values, train[target].values
x_val, y_val = val.drop(target, axis=1).values, val[target].values
return x_train, y_train, x_val, y_val
x_train, y_train, x_val, y_val = validation_split(data, val_size=0.2)
In [48]:
import keras
def one_hot_output(y_train, y_val):
num_classes = len(np.unique(y_train))
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
return y_train, y_val
y_train, y_val = one_hot_output(y_train, y_val)
In [49]:
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("val size \t X:{} \t Y:{}".format(x_val.shape, y_val.shape))
In [50]:
def build_nn_binary_classification(input_size, output_size, summary=False):
input_nodes = input_size  # full-width hidden layer (the salary model used input_size // 8)
model = Sequential()
model.add(
Dense(input_nodes, input_dim=input_size, kernel_regularizer=regularizers.l2(0.0001)))
model.add(
Dense(output_size, activation='softmax', kernel_regularizer=regularizers.l2(0.0001)))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
if summary:
model.summary()
return model
build_nn = build_nn_binary_classification
In [51]:
def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
"""
Train the neural network model. validation_data, if provided, is used to
monitor val_loss for early stopping.
"""
if show:
print('Training ....')
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=0)]
t0 = time()
history = model.fit(
x_train,
y_train,
epochs=20,
batch_size=1024,
verbose=0,
validation_data=validation_data,
callbacks=callbacks)
if show:
print("time: \t {:.1f} s".format(time() - t0))
helper.show_training(history)
if path:
model.save(path)
print("\nModel saved at", path)
return history
model = None
model = build_nn_binary_classification(x_train.shape[1], y_train.shape[1], summary=True)
train_nn(model, x_train, y_train, validation_data=(x_val, y_val));
In [52]:
data_test = helper.classify_data(df_test, target, categorical=cat)
data_test, _ = helper.scale(data_test, scale_param)
data_test = data_test[model_features+target] # sort columns to match training features order
x_test, y_test = data_test.drop(target, axis=1).values, data_test[target].values
y_test = keras.utils.to_categorical(y_test, 2)
In [53]:
from sklearn.metrics import roc_auc_score
score = model.evaluate(x_test, y_test, verbose=0)
print("\nNeural Network Accuracy: {:.3f}\n".format(score[1]))
y_pred = model.predict(x_test)
print('Neural Network ROC AUC: {:.3f} \n'.format(roc_auc_score(y_test, y_pred)))
In [54]:
y_train = y_train[:,1]
y_test = y_test[:,1]
In [55]:
# from sklearn.ensemble import RandomForestClassifier
# clf=None
# clf = RandomForestClassifier()
# clf.fit(x_train, np.ravel(y_train))
# print("\nRandom Forest Accuracy: {:.3f}\n".format(clf.score(x_train, y_train)))
# y_pred = clf.predict_proba(x_test)
# print('Random Forest ROC_AUC: {:.3f}'.format(roc_auc_score(y_test, y_pred[:,1])))
In [56]:
helper.ml_classification(x_train, y_train, x_test, y_test, cross_validation=False)