In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
PATH = os.getcwd()
train_df = pd.read_csv(f'{PATH}\\train.csv', low_memory=False)
test_df = pd.read_csv(f'{PATH}\\test.csv', low_memory=False)
def display_all(df):
with pd.option_context("display.max_rows", 100):
with pd.option_context("display.max_columns", 100):
display(df)
In [2]:
# Markdown-style problem statement kept as a no-op string expression in a code
# cell; the trailing ';' suppresses the string from echoing as Out[].
'''
**Problem Statement**
An online question and answer platform has hired you as a data scientist to identify the best question authors on the platform.
This identification will bring more insight into increasing the user engagement. Given the tag of the question, number of views
received, number of answers, username and reputation of the question author, the problem requires you to predict the upvote
count that the question will receive.
**DATA DICTIONARY**
- Variable ----------- Definition
- ID ----------- Question ID
- Tag ----------- Anonymised tags representing question category
- Reputation ----------- Reputation score of question author
- Answers ----------- Number of times question has been answered
- Username ----------- Anonymised user id of question author
- Views ----------- Number of times question has been viewed
- Upvotes (Target) ----------- Number of upvotes for the question
''';
In [3]:
# Quick look at the first rows of the training set.
train_df.head()
Out[3]:
In [4]:
# Shapes plus unique ID / Username counts for both splits, shown as one tuple
# (parenthesized instead of using backslash continuations).
(
    'Train', train_df.shape, len(set(train_df.ID.values)), len(set(train_df.Username.values)),
    'Test', test_df.shape, len(set(test_df.ID.values)), len(set(test_df.Username.values)),
)
Out[4]:
In [3]:
def Intersection(lst1, lst2):
    """Return the number of distinct elements common to `lst1` and `lst2`.

    Both arguments may be any iterables; duplicates are ignored because the
    comparison is done on sets.  Uses `set & set` directly instead of the
    original's needless `list(...)` materialisation.
    """
    return len(set(lst1) & set(lst2))
In [7]:
# Overlap between train and test for user ids, reputations and view counts
# (parenthesized tuple instead of backslash continuations).
(
    Intersection(train_df.Username, test_df.Username),
    Intersection(train_df.Reputation, test_df.Reputation),
    Intersection(train_df.Views, test_df.Views),
)
Out[7]:
First of all, let's see how many different users appear in each dataset.
In [8]:
# Unique user ids per split; both variables are reused by later cells.
man_train_list = train_df.Username.unique()
man_test_list = test_df.Username.unique()
for label, users in (("Train", man_train_list), ("Test", man_test_list)):
    print("{0}: {1}".format(label, len(users)))
In [4]:
# Per-user row counts for each split, outer-joined on Username so a user
# missing from one split shows up as NaN in that column.
train_counts = train_df.groupby('Username').count().iloc[:, -1]
test_counts = test_df.groupby('Username').count().iloc[:, -1]
df_man = pd.concat([train_counts, test_counts], axis=1, join='outer')
df_man.columns = ['train_count', 'test_count']
print(df_man.head(20))
In [13]:
# Most active users in the training split.
print(df_man.sort_values(by = 'train_count', ascending = False).head(20))
In [20]:
# Cumulative share of entries contributed by users, most active first:
# a steep curve means a handful of users dominate that split.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, col in zip(axes, ('train_count', 'test_count')):
    counts = df_man[col].dropna().sort_values(ascending=False).reset_index(drop=True)
    ax.plot(counts.index + 1, counts.cumsum() / counts.sum())
    ax.set_title('cumulative ' + col)
In [21]:
def _top20_share(counts, label):
    """Print what share of entries the most active 20% of users account for.

    `counts` is a per-user count Series (may contain NaN for users absent
    from this split); `label` is the prefix used in the printed message.
    """
    n_top = int(len(counts.dropna()) * 0.2)
    # sort_values puts NaN last, so iloc[n_top] lands inside the non-null head
    pct = counts.sort_values(ascending = False).cumsum().iloc[n_top] / counts.sum() * 100
    print("{0}: 20% of man ({1}) responsible for {2:2.2f}% of entries".format(label, n_top, pct))

# Same computation for both splits — the original duplicated this code.
_top20_share(df_man['train_count'], "TRAIN")
_top20_share(df_man['test_count'], "TEST")
In [23]:
# Users that appear in exactly one of the two splits (both sets reused below).
train_users = set(man_train_list)
test_users = set(man_test_list)
man_not_in_test = train_users.difference(test_users)
man_not_in_train = test_users.difference(train_users)
for n, where in ((len(man_not_in_test), "train but not in test"),
                 (len(man_not_in_train), "test but not in train")):
    print("{} man are featured in {}".format(n, where))
In [120]:
# BUG FIX: the original `train_df.loc[list(man_not_in_test)]` indexed the
# frame by *row labels* — train_df has a default RangeIndex, so the username
# values were silently treated as row numbers and the wrong rows came back
# (or a KeyError for usernames beyond the frame length).  Select the rows of
# train-only users by the Username column instead.
train_df[train_df.Username.isin(man_not_in_test)].head()
## Need to drop them blindly...
#train_df.drop(index = train_df[train_df.Username.isin(man_not_in_test)].index, inplace=True).shape
Out[120]:
In [24]:
# df_man IS indexed by Username (it came from a groupby), so .loc with the
# username sets is the correct lookup here; summarise how active the
# one-split-only users are.
print(df_man.loc[list(man_not_in_test)]['train_count'].describe())
print(df_man.loc[list(man_not_in_train)]['test_count'].describe())
In [26]:
# Correlation between train and test activity for the 1000 most active users.
df_man.sort_values(by = 'train_count', ascending = False).head(1000).corr()
Out[26]:
In [5]:
# Scatter of per-user activity: train count vs test count.
df_man.sort_values(by = 'train_count', ascending = False).plot.scatter(x = 'train_count', y = 'test_count')
Out[5]:
In [37]:
# Top 50k users by train activity together with the cumulative percentage of
# all training entries they account for.
top_counts = df_man['train_count'].sort_values(ascending=False).head(50000)
cum_pct = top_counts.cumsum() / df_man['train_count'].sum() * 100
temp = pd.concat([top_counts, cum_pct], axis=1).reset_index()
temp.columns = ['user_id', 'count', 'percentage']
print(temp)
In [41]:
# Distinct question tags present in the training set.
set(train_df.Tag)
Out[41]:
In [15]:
# One-hot encode the Tag column per question, keyed by Username.
# `man_list` (users ordered by train activity) and `df10000` are reused below.
man_list = df_man['train_count'].sort_values(ascending=False).index
ixes = train_df.Username.isin(man_list)
tag_cols = ['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x']
df10000 = train_df[ixes][['Username', 'Tag']]
tags_dummies = pd.get_dummies(df10000.Tag)
df10000 = pd.concat([df10000, tags_dummies[tag_cols]], axis=1).drop('Tag', axis=1)
print("The contributors account for {} entries\n".format(len(df10000)))
print(df10000.head(10))
In [18]:
# Sanity check on the encoded frame.
df10000.head(2)
Out[18]:
In [17]:
# Replace anonymised user ids with human-readable fake names so the grouped
# tables below are easier to read.  Behavior depends on the exact list order
# and the fixed RNG seed, so the construction is left untouched.
# NOTE(review): 'Thomas' appears in both lists, so some generated full names
# collide and two different users can receive the same fake name.
import itertools
last_names = ['Mary', 'Patricia', 'Linda', 'Barbara', 'Elizabeth',
'Jennifer', 'Maria', 'Susan', 'Margaret', 'Dorothy',
'James', 'John', 'Robert', 'Michael', 'William', 'David',
'Richard', 'Charles', 'Joseph', 'Thomas','Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore',
'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia',
'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young',
'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson',
'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins']
first_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore',
'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia',
'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young',
'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson',
'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins','Mary', 'Patricia', 'Linda', 'Barbara', 'Elizabeth',
'Jennifer', 'Maria', 'Susan', 'Margaret', 'Dorothy',
'James', 'John', 'Robert', 'Michael', 'William', 'David',
'Richard', 'Charles', 'Joseph', 'Thomas']
# Every "first last" combination of the two pools.
names = [first + ' ' + last for first,last in (itertools.product(first_names, last_names))]
# shuffle them
# Fixed seed keeps the user -> name mapping reproducible across runs.
np.random.seed(12345)
np.random.shuffle(names)
# zip truncates to the shorter sequence: only the first len(names) users
# (man_list is ordered by train activity) get a fake name; the rest keep
# their numeric id.
dictionary = dict(zip(man_list, names))
# `isin(dictionary)` tests membership against the dict's KEYS (user ids).
df10000.loc[df10000.Username.isin(dictionary), 'Username' ] = df10000['Username'].map(dictionary)
print(df10000.head())
In [19]:
# see if the name coincides
# man_list (ordered by activity) was zipped against the shuffled `names`,
# so the 10 busiest users received names[:10]; the two listings should match.
print(names[:10])
print(df10000.groupby('Username').count().sort_values(by = 'a', ascending = False).head(10))
In [20]:
# Per-user tag profile: mean of the one-hot columns (= share of each tag)
# plus a total question count.  The concat yields 20 columns (10 means then
# 10 counts); iloc[:, :-9] keeps the 10 means and the FIRST count column,
# which is then renamed to 'count'.
gby = pd.concat([df10000.groupby('Username').mean(),df10000.groupby('Username').count()], axis = 1).iloc[:,:-9]
gby.columns = ['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x', 'count']
gby.sort_values(by = 'count', ascending = False).head(10)[['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x', 'count']]
Out[20]:
In [11]:
# Stacked tag-share bars for the 100 most active users, plus a second figure
# with their raw question counts.
gby.sort_values(by = 'count', ascending = False).head(100).drop('count', axis = 1).plot(kind = 'bar', stacked = True, figsize = (15,6))
plt.figure()
gby.sort_values(by = 'count', ascending = False)['count'].head(100).plot(kind = 'bar', figsize = (15,6));
I think this high diversity should be accounted for when building our predictive model!
It would be interesting to rank the users based on their interest levels (the tags). For instance, we could compute a "skill" score by assigning an increasing weight to each tag — e.g. 1 point for "r", 2 for "o", and so on up to 9 for "c".
In [96]:
# Sanity check on the per-user tag-profile frame.
gby.head(2)
Out[96]:
In [112]:
# Tag frequency comparison between the two splits, side by side.
tag_freqs = [df['Tag'].value_counts().sort_values(ascending=False)
             for df in (train_df, test_df)]
pd.concat(tag_freqs, sort=False, axis=1, keys=['Train_Stats', 'Test_Stats'])
Out[112]:
In [114]:
# Crude per-user "skill" score: weighted sum of tag shares with arbitrary
# per-tag weights.  The accumulation order matches the original expression
# exactly, so the floating-point result is identical.
tag_weights = (('r', 1), ('o', 2), ('h', 3), ('s', 4), ('a', 5),
               ('i', 6), ('p', 7), ('j', 8), ('c', 9))
skill = 0
for tag, weight in tag_weights:
    skill = skill + gby[tag] * weight
gby['skill'] = skill
print("Top performers")
gby.sort_values(by = 'skill', ascending = False).reset_index().head()
Out[114]:
In [115]:
# Same skill ranking read from the bottom: lowest-scoring users.
print("\nWorst performers")
gby.sort_values(by = 'skill', ascending = False).reset_index().tail()
Out[115]:
In [117]:
# Distribution of the skill score; note that gby.mean() also averages the
# derived 'skill' column since it was added to the frame.
gby.skill.plot(kind = 'hist', bins=10)
print(gby.mean())