AV_Enigma_ML_initial_eda


Imports


In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary

PATH = os.getcwd()

train_df = pd.read_csv(f'{PATH}\\train.csv', low_memory=False)
test_df = pd.read_csv(f'{PATH}\\test.csv', low_memory=False)

def display_all(df):
    with pd.option_context("display.max_rows", 100): 
        with pd.option_context("display.max_columns", 100): 
            display(df)

In [2]:
'''
**Problem Statement**

An online question and answer platform has hired you as a data scientist to identify the best question authors on the platform.
This identification will bring more insight into increasing the user engagement. Given the tag of the question, number of views
received, number of answers, username and reputation of the question author, the problem requires you to predict the upvote 
count that the question will receive.

**DATA DICTIONARY**
 
- Variable	          ----------- Definition
- ID	              ----------- Question ID
- Tag	              ----------- Anonymised tags representing question category
- Reputation	      ----------- Reputation score of question author
- Answers	          ----------- Number of times question has been answered
- Username	          ----------- Anonymised user id of question author
- Views	              ----------- Number of times question has been viewed
- Upvotes (Target)    ----------- Number of upvotes for the question
''';

In [3]:
train_df.head()


Out[3]:
ID Tag Reputation Answers Username Views Upvotes
0 52664 a 3942.0 2.0 155623 7855.0 42.0
1 327662 a 26046.0 12.0 21781 55801.0 1175.0
2 468453 c 1358.0 4.0 56177 8067.0 60.0
3 96996 a 264.0 3.0 168793 27064.0 9.0
4 131465 c 4271.0 4.0 112223 13986.0 83.0

In [4]:
'Train', train_df.shape, len(set(train_df.ID.values)), len(set(train_df.Username.values)), 'Test', \
test_df.shape, len(set(test_df.ID.values)), len(set(test_df.Username.values))


Out[4]:
('Train', (330045, 7), 330045, 141802, 'Test', (141448, 6), 141448, 79351)

In [3]:
def Intersection(lst1, lst2):
    return len(list(set(lst1).intersection(lst2)))

Username


In [7]:
Intersection(train_df.Username, test_df.Username),\
Intersection(train_df.Reputation, test_df.Reputation),\
Intersection(train_df.Views, test_df.Views)


Out[7]:
(45414, 13211, 37143)

First of all, let's see how many different User's we have on both datasets


In [8]:
man_train_list = train_df.Username.unique()
man_test_list = test_df.Username.unique()
print("Train: {0}".format(len(man_train_list)))
print("Test: {0}".format(len(man_test_list)))


Train: 141802
Test: 79351
  • Unique User's in Test are close to $1$/$2$ of user's in Train...

In [4]:
temp1 = train_df.groupby('Username').count().iloc[:,-1]
temp2 = test_df.groupby('Username').count().iloc[:,-1]
df_man = pd.concat([temp1,temp2], axis = 1, join = 'outer')
df_man.columns = ['train_count','test_count']
print(df_man.head(20))


          train_count  test_count
Username                         
0                 1.0         NaN
1                 1.0         NaN
2                 1.0         NaN
3                 1.0         NaN
4                 NaN         1.0
5                 1.0         NaN
6                 1.0         NaN
7                 NaN         1.0
8                 NaN         1.0
9                 1.0         NaN
10                1.0         NaN
11                1.0         NaN
12                1.0         NaN
13                1.0         NaN
14                1.0         NaN
15                NaN         1.0
16                NaN         1.0
17                1.0         NaN
18                1.0         NaN
19                NaN         1.0
  • Some Users have entries only in one of the two datasets

In [13]:
print(df_man.sort_values(by = 'train_count', ascending = False).head(20))


          train_count  test_count
Username                         
4118            797.0       336.0
17878           608.0       228.0
45704           483.0       205.0
23223           413.0       199.0
62142           408.0       200.0
6697            408.0       190.0
94185           391.0       152.0
59362           355.0       180.0
61217           337.0       151.0
41953           328.0       138.0
12074           326.0       134.0
82170           321.0       136.0
93544           320.0       116.0
58117           310.0       119.0
67129           297.0       168.0
21641           292.0       127.0
22010           269.0       119.0
42835           261.0        96.0
78813           248.0       122.0
80714           248.0       117.0
  • More clearer if one looks at the plots for the cumulative distributions.

In [20]:
fig, axes = plt.subplots(1,2, figsize = (12,5))
temp = df_man['train_count'].dropna().sort_values(ascending = False).reset_index(drop = True)
axes[0].plot(temp.index+1, temp.cumsum()/temp.sum())
axes[0].set_title('cumulative train_count');

temp = df_man['test_count'].dropna().sort_values(ascending = False).reset_index(drop = True)
axes[1].plot(temp.index+1, temp.cumsum()/temp.sum())
axes[1].set_title('cumulative test_count');



In [21]:
ix20 = int(len(df_man['train_count'].dropna())*0.2)
print("TRAIN: 20% of man ({0}) responsible for {1:2.2f}% of entries".format(ix20,df_man['train_count'].sort_values(ascending = False).cumsum().iloc[ix20]/df_man['train_count'].sum()*100))

ix20 = int(len(df_man['test_count'].dropna())*0.2)
print("TEST: 20% of man ({0}) responsible for {1:2.2f}% of entries".format(ix20, df_man['test_count'].sort_values(ascending = False).cumsum().iloc[ix20]/df_man['test_count'].sum()*100))


TRAIN: 20% of man (28360) responsible for 58.36% of entries
TEST: 20% of man (15870) responsible for 51.00% of entries
  • The man featuring in only one of the two datasets usually have very few entries.

In [23]:
man_not_in_test = set(man_train_list) - set(man_test_list)
man_not_in_train = set(man_test_list) - set(man_train_list)

print("{} man are featured in train but not in test".format(len(man_not_in_test)))
print("{} man are featured in test but not in train".format(len(man_not_in_train)))


96388 man are featured in train but not in test
33937 man are featured in test but not in train

In [120]:
train_df.loc[list(man_not_in_test)].head()
## Need to drop them blindly...
#train_df.drop(index = train_df.loc[list(man_not_in_test)].index, inplace=True).shape


Out[120]:
ID Tag Reputation Answers Username Views Upvotes
0 52664 a 3942.0 2.0 155623 7855.0 42.0
1 327662 a 26046.0 12.0 21781 55801.0 1175.0
2 468453 c 1358.0 4.0 56177 8067.0 60.0
3 96996 a 264.0 3.0 168793 27064.0 9.0
5 345568 r 440.0 1.0 23866 5577.0 4.0
6 376865 c 147.0 1.0 10759 17184.0 17.0
9 130054 c 2749.0 1.0 10189 227.0 0.0
10 218471 c 1142.0 7.0 71822 141836.0 166.0
11 144119 p 2554.0 1.0 137817 8086.0 42.0
12 155955 s 591.0 4.0 36550 16102.0 19.0
13 442549 h 394.0 1.0 83356 266.0 2.0
14 243606 o 2989.0 2.0 13728 5579.0 10.0
17 92433 j 1655.0 3.0 101561 3949.0 8.0
18 46038 c 922.0 2.0 132697 1133.0 9.0
21 167622 j 166.0 1.0 37105 1881.0 0.0
22 347488 j 774.0 3.0 108729 7169.0 10.0
26 223188 c 569.0 12.0 116775 38443.0 57.0
28 8721 i 3376.0 5.0 137432 11389.0 47.0
29 131779 s 380.0 3.0 15975 1323.0 5.0
30 106895 a 347.0 3.0 154557 22195.0 8.0
31 420600 r 76.0 2.0 96879 8668.0 1.0
32 293535 j 1740.0 2.0 16515 12630.0 40.0
36 347323 r 54.0 1.0 81259 886.0 2.0
37 254132 h 18404.0 13.0 73061 118423.0 2963.0
38 386883 c 46.0 9.0 167231 8466.0 25.0
39 245452 p 16272.0 8.0 59430 31055.0 451.0
41 437170 o 627.0 4.0 120144 35104.0 27.0
43 265741 p 70.0 4.0 171611 2660.0 10.0
46 186056 i 0.0 22.0 45812 45331.0 30.0
47 434154 i 17465.0 4.0 19956 47336.0 1013.0
... ... ... ... ... ... ... ...
175685 419023 c 48.0 4.0 3867 390.0 9.0
175686 134866 i 1275.0 1.0 114962 468.0 2.0
175689 195185 c 502.0 3.0 105475 1156.0 5.0
175690 246639 s 354.0 4.0 37778 6453.0 8.0
175692 104292 c 86.0 2.0 10479 11188.0 16.0
175695 292430 r 394.0 2.0 80923 76417.0 39.0
175696 187967 h 1701.0 10.0 15299 30663.0 51.0
175698 89657 s 1335.0 7.0 27838 5250.0 27.0
175701 63811 c 1494.0 2.0 67178 3204.0 1.0
175702 256345 c 248.0 5.0 78813 21298.0 13.0
175705 308463 j 60.0 5.0 20973 2409.0 6.0
175706 279978 c 14612.0 2.0 83294 934.0 21.0
175708 373646 x 648.0 4.0 108992 6321.0 14.0
175711 295925 j 826.0 5.0 108921 65772.0 54.0
175712 218242 c 121.0 5.0 129983 81017.0 11.0
175713 92381 j 1195.0 4.0 143681 31810.0 53.0
175716 293218 a 266.0 7.0 37017 73362.0 53.0
175719 141928 c 58.0 3.0 21088 44047.0 12.0
175720 341406 h 3093.0 2.0 41248 1468.0 8.0
175724 307320 c 339.0 2.0 29252 23742.0 57.0
175725 402402 o 17363.0 7.0 27395 4485.0 51.0
175728 361017 c 18209.0 2.0 130137 8118.0 208.0
175729 338716 o 31897.0 3.0 106196 34537.0 1246.0
175730 287321 p 3060.0 6.0 4759 19198.0 75.0
175732 220213 c 87.0 3.0 56109 26727.0 109.0
175733 81393 p 494.0 2.0 79773 329.0 2.0
175734 16076 c 4786.0 6.0 99345 145441.0 654.0
175735 161486 j 9751.0 11.0 22428 78486.0 516.0
175736 289853 p 4581.0 4.0 33986 36546.0 311.0
175738 214165 h 24025.0 4.0 9498 22801.0 1008.0

96388 rows × 7 columns


In [24]:
print(df_man.loc[list(man_not_in_test)]['train_count'].describe())
print(df_man.loc[list(man_not_in_train)]['test_count'].describe())


count    96388.000000
mean         1.406721
std          0.902468
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         17.000000
Name: train_count, dtype: float64
count    33937.000000
mean         1.118337
std          0.379317
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          6.000000
Name: test_count, dtype: float64
  • Strong Correlation among them

In [26]:
df_man.sort_values(by = 'train_count', ascending = False).head(1000).corr()


Out[26]:
train_count test_count
train_count 1.000000 0.974016
test_count 0.974016 1.000000

In [5]:
df_man.sort_values(by = 'train_count', ascending = False).plot.scatter(x = 'train_count', y = 'test_count')


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x2568ac5b198>

In [37]:
temp = df_man['train_count'].sort_values(ascending = False).head(50000)
temp = pd.concat([temp,temp.cumsum()/df_man['train_count'].sum()*100], axis = 1).reset_index()
temp.columns = ['user_id','count','percentage']
print(temp)


       user_id  count  percentage
0         4118  797.0    0.241482
1        17878  608.0    0.425700
2        45704  483.0    0.572043
3        23223  413.0    0.697178
4         6697  408.0    0.820797
5        62142  408.0    0.944417
6        94185  391.0    1.062885
7        59362  355.0    1.170446
8        61217  337.0    1.272554
9        41953  328.0    1.371934
10       12074  326.0    1.470709
11       82170  321.0    1.567968
12       93544  320.0    1.664924
13       58117  310.0    1.758851
14       67129  297.0    1.848839
15       21641  292.0    1.937312
16       22010  269.0    2.018816
17       42835  261.0    2.097896
18       78813  248.0    2.173037
19       80714  248.0    2.248178
20       88986  241.0    2.321199
21       58656  233.0    2.391795
22       71408  231.0    2.461786
23       23032  222.0    2.529049
24       72364  214.0    2.593889
25        2177  209.0    2.657213
26       45253  206.0    2.719629
27       14828  202.0    2.780833
28       93065  201.0    2.841734
29       35130  200.0    2.902332
...        ...    ...         ...
49970   126547    2.0   71.450560
49971    30930    2.0   71.451166
49972      962    2.0   71.451772
49973      960    2.0   71.452378
49974      959    2.0   71.452984
49975   128607    2.0   71.453590
49976   152512    2.0   71.454196
49977   153401    2.0   71.454802
49978   173242    2.0   71.455408
49979    32122    2.0   71.456014
49980    15598    2.0   71.456620
49981   128109    2.0   71.457226
49982    32121    2.0   71.457832
49983   126819    2.0   71.458437
49984   128112    2.0   71.459043
49985   152614    2.0   71.459649
49986    31255    2.0   71.460255
49987   173154    2.0   71.460861
49988    31253    2.0   71.461467
49989   155217    2.0   71.462073
49990    16078    2.0   71.462679
49991   174104    2.0   71.463285
49992   126812    2.0   71.463891
49993    13838    2.0   71.464497
49994    31249    2.0   71.465103
49995   174107    2.0   71.465709
49996   128123    2.0   71.466315
49997     1684    2.0   71.466921
49998   173155    2.0   71.467527
49999    31261    2.0   71.468133

[50000 rows x 3 columns]

In [41]:
set(train_df.Tag)


Out[41]:
{'a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x'}

In [15]:
man_list = df_man['train_count'].sort_values(ascending = False).index
ixes = train_df.Username.isin(man_list)
df10000 = train_df[ixes][['Username','Tag']]
tags_dummies = pd.get_dummies(df10000.Tag)
df10000 = pd.concat([df10000,tags_dummies[['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x']]], axis = 1).drop('Tag', axis = 1)

print("The contributors account for {} entries\n".format(len(df10000)))
print(df10000.head(10))


The contributors account for 330045 entries

   Username  a  c  h  i  j  o  p  r  s  x
0    155623  1  0  0  0  0  0  0  0  0  0
1     21781  1  0  0  0  0  0  0  0  0  0
2     56177  0  1  0  0  0  0  0  0  0  0
3    168793  1  0  0  0  0  0  0  0  0  0
4    112223  0  1  0  0  0  0  0  0  0  0
5     23866  0  0  0  0  0  0  0  1  0  0
6     10759  0  1  0  0  0  0  0  0  0  0
7     54623  0  0  0  0  1  0  0  0  0  0
8    172926  0  0  0  0  1  0  0  0  0  0
9     10189  0  1  0  0  0  0  0  0  0  0

In [18]:
df10000.head(2)


Out[18]:
Username a c h i j o p r s x
0 155623 1 0 0 0 0 0 0 0 0 0
1 21781 1 0 0 0 0 0 0 0 0 0

In [17]:
import itertools

last_names = ['Mary',  'Patricia',  'Linda',  'Barbara',  'Elizabeth',  
               'Jennifer',  'Maria',  'Susan',  'Margaret',  'Dorothy',
               'James', 'John', 'Robert', 'Michael', 'William', 'David',
               'Richard', 'Charles', 'Joseph', 'Thomas','Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 
 'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 
 'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young',
 'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 
 'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins']
 
first_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 
 'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 
 'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young',
 'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 
 'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins','Mary',  'Patricia',  'Linda',  'Barbara',  'Elizabeth',  
               'Jennifer',  'Maria',  'Susan',  'Margaret',  'Dorothy',
               'James', 'John', 'Robert', 'Michael', 'William', 'David',
               'Richard', 'Charles', 'Joseph', 'Thomas']

names = [first + ' ' + last for first,last in (itertools.product(first_names, last_names))]

# shuffle them
np.random.seed(12345)
np.random.shuffle(names)

dictionary = dict(zip(man_list, names))
df10000.loc[df10000.Username.isin(dictionary), 'Username' ] = df10000['Username'].map(dictionary)
print(df10000.head())


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-17-05c8e3a96bb9> in <module>()
     19                'Richard', 'Charles', 'Joseph', 'Thomas']
     20 
---> 21 names = [first+ + ' ' + last for first,last in (itertools.product(first_names, last_names))]
     22 
     23 # shuffle them

<ipython-input-17-05c8e3a96bb9> in <listcomp>(.0)
     19                'Richard', 'Charles', 'Joseph', 'Thomas']
     20 
---> 21 names = [first+ + ' ' + last for first,last in (itertools.product(first_names, last_names))]
     22 
     23 # shuffle them

TypeError: bad operand type for unary +: 'str'

In [19]:
# see if the name coincides
print(names[:10])
print(df10000.groupby('Username').count().sort_values(by = 'a', ascending = False).head(10))


['Barbara Smith', 'Green Wright', 'Adams Jackson', 'Miller Campbell', 'Taylor Martin', 'Williams Lopez', 'Wright Lee', 'James James', 'Patricia Lee', 'Jackson Anderson']
            a    c    h    i    j    o    p    r    s    x
Username                                                  
4118      797  797  797  797  797  797  797  797  797  797
17878     608  608  608  608  608  608  608  608  608  608
45704     483  483  483  483  483  483  483  483  483  483
23223     413  413  413  413  413  413  413  413  413  413
62142     408  408  408  408  408  408  408  408  408  408
6697      408  408  408  408  408  408  408  408  408  408
94185     391  391  391  391  391  391  391  391  391  391
59362     355  355  355  355  355  355  355  355  355  355
61217     337  337  337  337  337  337  337  337  337  337
41953     328  328  328  328  328  328  328  328  328  328

In [20]:
gby = pd.concat([df10000.groupby('Username').mean(),df10000.groupby('Username').count()], axis = 1).iloc[:,:-9]
gby.columns = ['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x', 'count']
gby.sort_values(by = 'count', ascending = False).head(10)[['a', 'c', 'h', 'i', 'j', 'o', 'p', 'r', 's', 'x', 'count']]


Out[20]:
a c h i j o p r s x count
Username
4118 0.075282 0.208281 0.080301 0.048934 0.212045 0.040151 0.154329 0.090339 0.075282 0.015056 797
17878 0.074013 0.213816 0.054276 0.129934 0.167763 0.075658 0.118421 0.044408 0.085526 0.036184 608
45704 0.080745 0.215321 0.072464 0.072464 0.217391 0.022774 0.190476 0.033126 0.068323 0.026915 483
23223 0.077482 0.227603 0.060533 0.065375 0.261501 0.048426 0.147700 0.033898 0.072639 0.004843 413
62142 0.080882 0.183824 0.073529 0.075980 0.264706 0.024510 0.156863 0.036765 0.073529 0.029412 408
6697 0.085784 0.178922 0.056373 0.171569 0.132353 0.115196 0.127451 0.029412 0.093137 0.009804 408
94185 0.063939 0.199488 0.102302 0.056266 0.289003 0.020460 0.171355 0.023018 0.061381 0.012788 391
59362 0.104225 0.242254 0.061972 0.061972 0.228169 0.022535 0.166197 0.022535 0.070423 0.019718 355
61217 0.097923 0.169139 0.059347 0.112760 0.195846 0.062315 0.151335 0.050445 0.065282 0.035608 337
41953 0.106707 0.198171 0.079268 0.079268 0.189024 0.021341 0.167683 0.057927 0.079268 0.021341 328
  • Their performances seem very different, even for people with similar number of entries.

In [11]:
gby.sort_values(by = 'count', ascending = False).head(100).drop('count', axis = 1).plot(kind = 'bar', stacked = True, figsize = (15,6))
plt.figure()
gby.sort_values(by = 'count', ascending = False)['count'].head(100).plot(kind = 'bar', figsize = (15,6));


I think this high diversity should be accounted for when building our predictive model!

It would be interesting to rank the man based on their interest levels(the tags). For instance, we could compute their "skill" by assigning 0 points for "a", 1 for "b" and 2 for "c"...


In [96]:
gby.head(2)


Out[96]:
a c h i j o p r s x count
Username
Adams Jackson 0.080745 0.215321 0.072464 0.072464 0.217391 0.022774 0.190476 0.033126 0.068323 0.026915 483
Allen Collins 0.072993 0.313869 0.080292 0.058394 0.189781 0.029197 0.131387 0.014599 0.072993 0.036496 137

In [112]:
pd.concat([train_df['Tag'].value_counts().sort_values(ascending=False),test_df['Tag'].value_counts().sort_values(ascending=False)],sort=False, axis =1,\
          keys=['Train_Stats', 'Test_Stats'])


Out[112]:
Train_Stats Test_Stats
c 72458 30793
j 72232 30811
p 43407 18713
i 32400 14100
a 31695 13558
s 23323 10088
h 20564 8782
o 14546 6296
r 12442 5313
x 6978 2994

In [114]:
gby['skill'] = gby['r']*1 + gby['o']*2 + gby['h']*3 + gby['s']*4 + gby['a']*5 + gby['i']*6 + gby['p']*7 + gby['j']*8 \
+ gby['c']*9  

print("Top performers")
gby.sort_values(by = 'skill', ascending = False).reset_index().head()


Top performers
Out[114]:
Username a c h i j o p r s x count skill
0 Walker Garcia 0.020202 0.929293 0.000000 0.000000 0.010101 0.0 0.000000 0.0 0.030303 0.010101 99 8.666667
1 Johnson Parker 0.000000 0.850000 0.037500 0.000000 0.025000 0.0 0.062500 0.0 0.012500 0.012500 80 8.450000
2 Mary Margaret 0.140187 0.794393 0.000000 0.000000 0.065421 0.0 0.000000 0.0 0.000000 0.000000 107 8.373832
3 Davis Richard 0.038095 0.657143 0.085714 0.000000 0.095238 0.0 0.057143 0.0 0.066667 0.000000 105 7.790476
4 Edwards Wilson 0.005495 0.653846 0.186813 0.021978 0.120879 0.0 0.010989 0.0 0.000000 0.000000 182 7.648352

In [115]:
print("\nWorst performers")
gby.sort_values(by = 'skill', ascending = False).reset_index().tail()


Worst performers
Out[115]:
Username a c h i j o p r s x count skill
95 Maria Jones 0.937500 0.025000 0.000000 0.000000 0.012500 0.000000 0.000000 0.000000 0.000000 0.025000 80 5.012500
96 Phillips Joseph 0.148760 0.000000 0.000000 0.363636 0.016529 0.000000 0.000000 0.000000 0.471074 0.000000 121 4.942149
97 Garcia Davis 0.159292 0.000000 0.008850 0.353982 0.123894 0.336283 0.000000 0.000000 0.000000 0.017699 113 4.610619
98 Robinson Phillips 0.094118 0.082353 0.035294 0.094118 0.152941 0.035294 0.129412 0.364706 0.011765 0.000000 85 4.494118
99 Wright Martin 0.022472 0.022472 0.011236 0.438202 0.000000 0.348315 0.011236 0.000000 0.146067 0.000000 89 4.337079

In [117]:
gby.skill.plot(kind = 'hist', bins=10)
print(gby.mean())


a          0.083726
c          0.244781
h          0.070689
i          0.097548
j          0.213523
o          0.045212
p          0.127928
r          0.031274
s          0.067798
x          0.017520
count    178.100000
skill      6.415586
dtype: float64