In [350]:
import pandas as pd
import os, sys
from collections import defaultdict
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import preprocessing
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [415]:
network = pd.read_csv('../data/network/vertex_attributes.csv')
network.shape
Out[415]:
In [466]:
cols = [u'name', u'degree', u'in', u'out', u'core', u'core-in', u'core-out', u'closeness', u'betweenness', u'eigen_centrality', u'pagerank', u'neighborhood', u'avg_neighborhood', u'authority', u'hub', u'clustering_coef']
network = network[cols]
network['name'] = network['name'].astype(str)
network['percent-out'] = network['out'].div(network['degree'])
# network['name'] = [col.split('@')[0] for col in network['name']]
# network.head()
In [ ]:
VERTICES: 1646
EDGES: 4924
GLOBAL CC 0.0418348603048
AVG PATH LENGTH 2.74013916003
CLIQUES 138340
CLIQUE NO 14
DIAMETER 22.0
RECIPROCITY 0.245735174655
LARGEST CLUSTER SIZE 378 Vertices, 2708 Edges
ONLY ASKED QUESTIONS: 464
ONLY ANSWERED QUESTIONS: 764
ASKED AND ANSWERED: 415
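These headline statistics were computed outside this notebook. As a rough cross-check, here is a minimal sketch of how they could be recomputed with python-igraph (assuming igraph is installed and using the same edge list loaded later in this notebook; neither step appears in the original):
In [ ]:
import igraph

# hypothetical reconstruction: build the directed reply graph from the edge list
edges = pd.read_csv('../data/network/plots_edgelist.csv')
g = igraph.Graph.TupleList(edges[['source', 'target']].itertuples(index=False),
                           directed=True)
print 'VERTICES:', g.vcount()
print 'EDGES:', g.ecount()
print 'GLOBAL CC', g.transitivity_undirected()
print 'AVG PATH LENGTH', g.average_path_length()
print 'DIAMETER', g.diameter()
print 'RECIPROCITY', g.reciprocity()
giant = g.clusters(mode='WEAK').giant()  # largest weakly connected component
print 'LARGEST CLUSTER SIZE', giant.vcount(), 'Vertices,', giant.ecount(), 'Edges'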
In [423]:
cols = [u'name', u'degree', u'in', u'out', u'core', u'core-in', u'core-out', u'closeness', u'betweenness', u'eigen_centrality', u'pagerank', u'neighborhood', u'avg_neighborhood', u'authority', u'hub', u'clustering_coef']
network = network[cols]
network = network.apply(pd.to_numeric, errors='ignore')
print network.describe().round(4).loc[['mean', 'std', 'min', 'max']].T
In [412]:
plotDegreeDistribution(network['degree'])
In [424]:
title = 'Degree Distributions'
degrees = network[['degree', 'in', 'out']]
degrees.plot(kind='hist', subplots=True, layout=(1,3), title=title, log=True, figsize=(20, 4), bins=45, edgecolor='w')
# plt.plot()
plt.show()
In [459]:
only_in = network['in'] > 0
no_out = network['out'] == 0
only_out = network['out'] > 0
no_in = network['in'] == 0
only_ask = network[only_in & no_out]
only_answer = network[only_out & no_in]
ask_answer = network[only_in & only_out]
print "ONLY ASK:", only_ask.shape[0]
print "ONLY ANSWER:", only_answer.shape[0]
print "ASK AND ANSWER:", ask_answer.shape[0]
In [572]:
gt_three = network[network['degree'] >= 6].copy()
gt_three['percent-out'].hist()
Out[572]:
In [550]:
gt_three.shape
Out[550]:
In [363]:
edges = pd.read_csv('../data/network/plots_edgelist.csv')
plots_org = edges[edges['fid'] == 'plots-organizers']
organizers = pd.concat([plots_org['source'], plots_org['target']])
network['org'] = network['name'].isin(organizers)
# network.head()
In [585]:
gt_three['percent-out'].describe(percentiles=[.33, .5, .66])
Out[585]:
In [587]:
# label users by share of outgoing edges, using the percentile cuts above:
# answerers (> .55), askers (< .33), everyone in between as 'both'
gt_three['ans'] = gt_three['percent-out'] > .55
gt_three['ask'] = gt_three['percent-out'] < .33
gt_three['both'] = ~gt_three['ans'] & ~gt_three['ask']
gt_three['org'] = gt_three['name'].isin(organizers)
In [588]:
gt_feature_cols = gt_three.columns[1:-4]
gt_labels_cols = gt_three.columns[-4:]
gt_features = gt_three[gt_feature_cols]
gt_labels = gt_three[gt_labels_cols]
print gt_features.shape, gt_labels.shape
In [589]:
gt_min_max_scaler = preprocessing.MinMaxScaler()
gt_feat_scaled = gt_min_max_scaler.fit_transform(gt_features)
gt_feat_norm = pd.DataFrame(gt_feat_scaled, columns=gt_features.columns)
# gt_feat_norm.head()
In [591]:
gt_ask_clf = LinearSVC(random_state=0)
gt_ask_clf.fit(gt_feat_norm, gt_labels['ask'])
gt_coef = gt_ask_clf.coef_.ravel()
gt_importance = pd.DataFrame(gt_coef, index=gt_feat_norm.columns, columns=['value'])
gt_pos_class = gt_importance[gt_importance['value'] > 0]
gt_neg_class = gt_importance[gt_importance['value'] < 0]
print 'ACCURACY:', gt_ask_clf.score(gt_feat_norm, gt_labels['ask'])
print ''
print gt_pos_class.sort_values(by='value', ascending=False)
print ''
print gt_neg_class.sort_values(by='value', ascending=True)
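The accuracy above is computed on the same rows the classifier was fit on, so it is an optimistic in-sample figure. A minimal held-out check (a sketch using scikit-learn's cross_val_score; not part of the original analysis):
In [ ]:
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation on older versions

# 5-fold cross-validated accuracy for the 'ask' classifier
cv_scores = cross_val_score(LinearSVC(random_state=0), gt_feat_norm, gt_labels['ask'], cv=5)
print 'CV ACCURACY: %.4f (+/- %.4f)' % (cv_scores.mean(), cv_scores.std())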
In [621]:
ask_cols = ['in', 'closeness', 'eigen_centrality', 'pagerank', 'authority']
ask = gt_three[gt_three['ask'] == True]
ask = ask[ask_cols]
ans = gt_three[gt_three['ans'] == True]
ans = ans[ask_cols]
both = gt_three[gt_three['both'] == True]
both = both[ask_cols]
overall = gt_three[ask_cols]
print 'ASKERS'
# ask['avg_neighborhood'] = normalize_column(ask['avg_neighborhood'])
print ask.describe().round(4).loc[['mean', 'std']]
print ''
print 'ANSWERERS'
# not_ask['avg_neighborhood'] = normalize_column(not_ask['avg_neighborhood'])
print ans.describe().round(4).loc[['mean', 'std']]
print ''
print 'BOTH'
# not_ask['avg_neighborhood'] = normalize_column(not_ask['avg_neighborhood'])
print both.describe().round(4).loc[['mean', 'std']]
print ''
print 'OVERALL'
# ask_total['avg_neighborhood'] = normalize_column(ask_total['avg_neighborhood'])
print overall.describe().round(4).loc[['mean', 'std']]
In [618]:
neg_cols = ['out', 'hub', 'core', 'clustering_coef', 'betweenness', 'avg_neighborhood', 'degree', 'neighborhood']
neg_cols_abv = ['out', 'hub', 'core', 'clust', 'betwn', 'avg_ngh', 'deg', 'neigh']
ask = gt_three[gt_three['ask'] == True]
ask = ask[neg_cols]
ask.columns = neg_cols_abv
ans = gt_three[gt_three['ans'] == True]
ans = ans[neg_cols]
ans.columns = neg_cols_abv
both = gt_three[gt_three['both'] == True]
both = both[neg_cols]
both.columns = neg_cols_abv
overall = gt_three[neg_cols]
overall.columns = neg_cols_abv
print 'ASKERS'
ask['avg_ngh'] = normalize_column(ask['avg_ngh'])
ask['betwn'] = normalize_column(ask['betwn'])
print ask.describe().round(4).loc[['mean', 'std']]
print ''
print 'ANSWERERS'
ans['avg_ngh'] = normalize_column(ans['avg_ngh'])
ans['betwn'] = normalize_column(ans['betwn'])
print ans.describe().round(4).loc[['mean', 'std']]
print ''
print 'BOTH'
both['avg_ngh'] = normalize_column(both['avg_ngh'])
both['betwn'] = normalize_column(both['betwn'])
print both.describe().round(4).loc[['mean', 'std']]
print ''
print 'OVERALL'
overall['avg_ngh'] = normalize_column(overall['avg_ngh'])
overall['betwn'] = normalize_column(overall['betwn'])
print overall.describe().round(4).loc[['mean', 'std']]
In [596]:
gt_ans_clf = LinearSVC(random_state=0)
gt_ans_clf.fit(gt_feat_norm, gt_labels['ans'])
gt_coef = gt_ans_clf.coef_.ravel()
gt_importance = pd.DataFrame(gt_coef, index=gt_feat_norm.columns, columns=['value'])
gt_pos_class = gt_importance[gt_importance['value'] > 0]
gt_neg_class = gt_importance[gt_importance['value'] < 0]
print 'ACCURACY:', gt_ans_clf.score(gt_feat_norm, gt_labels['ans'])
print ''
print gt_pos_class.sort_values(by='value', ascending=False)
print ''
print gt_neg_class.sort_values(by='value', ascending=True)
In [629]:
abv_cols = ['out', 'hub', 'clust', 'closeness', 'between', 'neighbrhd', 'degree']
ans_cols = ['out', 'hub', 'clustering_coef', 'closeness', 'betweenness', 'neighborhood', 'degree']
ask = gt_three[gt_three['ask'] == True]
ask = ask[ans_cols]
ask.columns = abv_cols
ans = gt_three[gt_three['ans'] == True]
ans = ans[ans_cols]
ans.columns = abv_cols
both = gt_three[gt_three['both'] == True]
both = both[ans_cols]
both.columns = abv_cols
overall = gt_three[ans_cols]
overall.columns = abv_cols
print 'ASKERS'
ask['between'] = normalize_column(ask['between'])
print ask.describe().round(4).loc[['mean', 'std']]
print ''
print 'ANSWERERS'
ans['between'] = normalize_column(ans['between'])
print ans.describe().round(4).loc[['mean', 'std']]
print ''
print 'BOTH'
both['between'] = normalize_column(both['between'])
print both.describe().round(4).loc[['mean', 'std']]
print ''
print 'OVERALL'
overall['between'] = normalize_column(overall['between'])
print overall.describe().round(4).loc[['mean', 'std']]
In [623]:
neg_abv_cols = ['in', 'eigen', 'pagerank', 'authority', 'avg_neigh', 'core']
neg_ans_cols = ['in', 'eigen_centrality', 'pagerank', 'authority', 'avg_neighborhood', 'core']
ask = gt_three[gt_three['ask'] == True]
ask = ask[neg_ans_cols]
ask.columns = neg_abv_cols
ans = gt_three[gt_three['ans'] == True]
ans = ans[neg_ans_cols]
ans.columns = neg_abv_cols
both = gt_three[gt_three['both'] == True]
both = both[neg_ans_cols]
both.columns = neg_abv_cols
overall = gt_three[neg_ans_cols]
overall.columns = neg_abv_cols
print 'ASKERS'
ask['avg_neigh'] = normalize_column(ask['avg_neigh'])
print ask.describe().round(4).loc[['mean', 'std']]
print ''
print 'ANSWERERS'
ans['avg_neigh'] = normalize_column(ans['avg_neigh'])
print ans.describe().round(4).loc[['mean', 'std']]
print ''
print 'BOTH'
both['avg_neigh'] = normalize_column(both['avg_neigh'])
print both.describe().round(4).loc[['mean', 'std']]
print ''
print 'OVERALL'
overall['avg_neigh'] = normalize_column(overall['avg_neigh'])
print overall.describe().round(4).loc[['mean', 'std']]
In [581]:
gt_both_clf = LinearSVC(random_state=0)
gt_both_clf.fit(gt_feat_norm, gt_labels['both'])
gt_coef = gt_both_clf.coef_.ravel()
gt_importance = pd.DataFrame(gt_coef, index=gt_feat_norm.columns, columns=['value'])
gt_pos_class = gt_importance[gt_importance['value'] > 0]
gt_neg_class = gt_importance[gt_importance['value'] < 0]
print 'ACCURACY:', gt_both_clf.score(gt_feat_norm, gt_labels['both'])
print ''
print gt_pos_class.sort_values(by='value', ascending=False)
print ''
print gt_neg_class.sort_values(by='value', ascending=True)
In [625]:
abv_cols = ['core', 'eigen', 'avg_neigh', 'pagerank', 'authority', 'in']
both_cols = ['core', 'eigen_centrality', 'avg_neighborhood', 'pagerank', 'authority', 'in']
ask = gt_three[gt_three['ask'] == True]
ask = ask[both_cols]
ask.columns = abv_cols
ans = gt_three[gt_three['ans'] == True]
ans = ans[both_cols]
ans.columns = abv_cols
both = gt_three[gt_three['both'] == True]
both = both[both_cols]
both.columns = abv_cols
overall = gt_three[both_cols]
overall.columns = abv_cols
print 'ASKERS'
# ask['between'] = normalize_column(ask['between'])
print ask.describe().round(4).loc[['mean', 'std']]
print ''
print 'ANSWERERS'
# ans['between'] = normalize_column(ans['between'])
print ans.describe().round(4).loc[['mean', 'std']]
print ''
print 'BOTH'
# both['between'] = normalize_column(both['between'])
print both.describe().round(4).loc[['mean', 'std']]
print ''
print 'OVERALL'
# overall['between'] = normalize_column(overall['between'])
print overall.describe().round(4).loc[['mean', 'std']]
In [624]:
neg_abv_cols = ['closeness', 'out', 'between', 'neighb', 'clust_coef', 'degree', 'hub']
neg_ans_cols = ['closeness', 'out', 'betweenness', 'neighborhood', 'clustering_coef', 'degree', 'hub']
ask = gt_three[gt_three['ask'] == True]
ask = ask[neg_ans_cols]
ask.columns = neg_abv_cols
ans = gt_three[gt_three['ans'] == True]
ans = ans[neg_ans_cols]
ans.columns = neg_abv_cols
both = gt_three[gt_three['both'] == True]
both = both[neg_ans_cols]
both.columns = neg_abv_cols
overall = gt_three[neg_ans_cols]
overall.columns = neg_abv_cols
print 'ASKERS'
ask['between'] = normalize_column(ask['between'])
print ask.describe().round(4).loc[['mean', 'std']]
print ''
print 'ANSWERERS'
ans['between'] = normalize_column(ans['between'])
print ans.describe().round(4).loc[['mean', 'std']]
print ''
print 'BOTH'
both['between'] = normalize_column(both['between'])
print both.describe().round(4).loc[['mean', 'std']]
print ''
print 'OVERALL'
overall['between'] = normalize_column(overall['between'])
print overall.describe().round(4).loc[['mean', 'std']]
In [610]:
org_clf = LinearSVC(random_state=0)
org_clf.fit(gt_feat_norm, gt_labels['org'])
gt_coef = org_clf.coef_.ravel()
gt_importance = pd.DataFrame(gt_coef, index=gt_feat_norm.columns, columns=['value'])
gt_pos_class = gt_importance[gt_importance['value'] > 0]
gt_neg_class = gt_importance[gt_importance['value'] < 0]
print 'ACCURACY:', org_clf.score(gt_feat_norm, gt_labels['org'])
print ''
print gt_pos_class.sort_values(by='value', ascending=False)
print ''
print gt_neg_class.sort_values(by='value', ascending=True)
In [626]:
org_cols = ['core', 'eigen_centrality', 'avg_neighborhood', 'pagerank', 'authority', 'in']
abv_cols = ['core', 'eigen', 'avg_neigh', 'pagerank', 'authority', 'in']
org = gt_three[gt_three['org'] == True]
org = org[org_cols]
org.columns = abv_cols
not_org = gt_three[gt_three['org'] == False]
not_org = not_org[org_cols]
not_org.columns = abv_cols
total = gt_three[org_cols]
total.columns = abv_cols
print 'ORGANIZERS'
# org['betweenness'] = normalize_column(org['betweenness'])
print org.describe().round(4).loc[['mean', 'std']]
print ''
print 'NOT ORGANIZERS'
# not_org['betweenness'] = normalize_column(not_org['betweenness'])
print not_org.describe().round(4).loc[['mean', 'std']]
print ''
print 'ORIGINALS'
# org_total['betweenness'] = normalize_column(org_total['betweenness'])
print total.describe().round(4).loc[['mean', 'std']]
In [ ]:
BOTH
         core   eigen  avg_neigh  pagerank  authority       in
mean  12.1895  0.0517   487.1042    0.0052     0.0332  24.1158
std    6.4219  0.1425   474.9928    0.0171     0.1164  57.8238
In [632]:
neg_org_cols = ['closeness', 'out', 'betweenness', 'neighborhood', 'clustering_coef', 'degree', 'hub']
neg_abv_cols = ['close', 'out', 'between', 'neigh', 'clust_coef', 'deg', 'hub']
org = gt_three[gt_three['org'] == True]
org = org[neg_org_cols]
org.columns = neg_abv_cols
not_org = gt_three[gt_three['org'] == False]
not_org = not_org[neg_org_cols]
not_org.columns = neg_abv_cols
total = gt_three[neg_org_cols]
total.columns = neg_abv_cols
print 'ORGANIZERS'
org['between'] = normalize_column(org['between'])
print org.describe().round(4).loc[['mean', 'std']]
print ''
print 'NOT ORGANIZERS'
not_org['between'] = normalize_column(not_org['between'])
print not_org.describe().round(4).loc[['mean', 'std']]
print ''
print 'ORIGINALS'
total['between'] = normalize_column(total['between'])
print total.describe().round(4).loc[['mean', 'std']]
In [ ]:
BOTH
       close      out  between    neigh  clust_coef       deg     hub
mean  0.0512  21.2526   3.7229  37.9895      0.6507   45.3684  0.0602
std   0.0022  58.3994  13.0320  93.5025      0.2690  115.5835  0.1431
In [612]:
org = gt_three[gt_three['org'] == True]
both = gt_three[gt_three['both'] == True]
both = both[both_cols]
print 'ORG:', org.shape, 'BOTH:', both.shape
In [411]:
# http://www.pythonexample.com/code/plot-degree-distribution-igraph-python/
def plotDegreeDistribution(degrees):
    # log-log plot of the empirical degree distribution P(K)
    degs = defaultdict(int)
    for i in degrees:
        degs[i] += 1
    items = sorted(degs.items())
    x, y = np.array(items).T
    y = [float(i) / sum(y) for i in y]
    plt.figure(figsize=(10, 8))
    plt.plot(x, y, 'bo')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$K$', fontsize=20)
    plt.ylabel('$P_K$', fontsize=20)
    plt.title(r'$Degree\,Distribution$', fontsize=20)
    plt.show()
In [430]:
def normalize_column(col):
    # min-max scale a column to the 0-100 range
    min_value = col.min()
    max_value = col.max()
    return ((col - min_value) / (max_value - min_value)) * 100
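For reference, a quick sanity check of normalize_column on made-up values (not from the data): the minimum maps to 0, the maximum to 100, and everything else falls linearly in between.
In [ ]:
s = pd.Series([2.0, 5.0, 10.0])
print normalize_column(s)  # 0.0, 37.5, 100.0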