In [117]:
import pandas as pd
import numpy as np

In [118]:
submissions_df = pd.read_csv("/Users/frankkelly/Downloads/collated - Tabellenblatt1.csv")
submissions_df.head()


Out[118]:
ID Rating Category Level Coolness, attractiveness pycon long slot ID.1 Grade Category.1 ... Talk/Workshop ID.2 Type Rating: [a, b, c, d] cat level Coolness [+, ++, +++] Attractiveness: [+, ++, +++] Suggest to Pycon: [yes, no] Long slot: [yes, no]
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0B9B D - beginner + yes no 0B9B D general ... Talk NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 0B9D C general beginner ++ no no 0B9D B general ... Talk NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 0B9F C visualisation intermediate + no no 0B9F B Visualisation/Deployment ... Talk 0B9F talk. c visualisation intermediate +++ +++ yes no
4 0BA2 A general advanced +++ no no 0BA2 A process/pipelines ... Workshop 0BA2 workshop. b deep learning beginner + + no no

5 rows × 25 columns


In [119]:
submissions_df = submissions_df.loc[1:,:]

In [120]:
submissions_df.head()


Out[120]:
ID Rating Category Level Coolness, attractiveness pycon long slot ID.1 Grade Category.1 ... Talk/Workshop ID.2 Type Rating: [a, b, c, d] cat level Coolness [+, ++, +++] Attractiveness: [+, ++, +++] Suggest to Pycon: [yes, no] Long slot: [yes, no]
1 0B9B D - beginner + yes no 0B9B D general ... Talk NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 0B9D C general beginner ++ no no 0B9D B general ... Talk NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 0B9F C visualisation intermediate + no no 0B9F B Visualisation/Deployment ... Talk 0B9F talk. c visualisation intermediate +++ +++ yes no
4 0BA2 A general advanced +++ no no 0BA2 A process/pipelines ... Workshop 0BA2 workshop. b deep learning beginner + + no no
5 0BA5 C general beginner +++ yes no 0BA5 B visualisation ... Talk NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 25 columns


In [121]:
submissions_df.columns


Out[121]:
Index(['ID', 'Rating', 'Category', 'Level', 'Coolness, attractiveness',
       'pycon', 'long slot', 'ID.1', 'Grade', 'Category.1', 'Level.1',
       'Data Track Y/N', 'Coolness', 'Suggest to Pycon', 'Long Slot',
       'Talk/Workshop', 'ID.2', 'Type', 'Rating: [a, b, c, d]', 'cat', 'level',
       'Coolness [+, ++, +++]', 'Attractiveness: [+, ++, +++]',
       'Suggest to Pycon: [yes, no]', 'Long slot: [yes, no]'],
      dtype='object')

In [122]:
grade_df = submissions_df[["ID", "Rating", "Grade", "Rating: [a, b, c, d]"]]
grade_df.head()


Out[122]:
ID Rating Grade Rating: [a, b, c, d]
1 0B9B D D NaN
2 0B9D C B NaN
3 0B9F C B c
4 0BA2 A A b
5 0BA5 C B NaN

In [123]:
pure_cool_df = submissions_df[["Coolness, attractiveness", "Coolness", "Coolness [+, ++, +++]"]]
pure_cool_df.head()


Out[123]:
Coolness, attractiveness Coolness Coolness [+, ++, +++]
1 + + NaN
2 ++ ++ NaN
3 + ++ +++
4 +++ +++ +
5 +++ +++ (but PyCon might want her) NaN

In [124]:
pure_cool_df


Out[124]:
Coolness, attractiveness Coolness Coolness [+, ++, +++]
1 + + NaN
2 ++ ++ NaN
3 + ++ +++
4 +++ +++ +
5 +++ +++ (but PyCon might want her) NaN
6 + + NaN
7 ++ + ++
8 + +++ +++
9 ++ ++ NaN
10 +++ ++ ++
11 ++ NaN NaN
12 +++ +++ ++
13 ++ +++ NaN
14 + + ++
15 + +++ NaN
16 + NaN NaN
17 + ++ +
18 +++ ++ NaN
19 +++ +++ NaN
20 + + NaN
21 + ++ +++
22 +++ +++ +++
23 + + NaN
24 + +++ NaN
25 ++ ++ NaN
26 ++ ++ NaN
27 +++ ++ NaN
28 ++ NaN NaN
29 ++ NaN ++
30 + +++ +
... ... ... ...
58 ++ ++ NaN
59 ++ ++ +
60 +++ ++ +++
61 + NaN NaN
62 ++ ++ NaN
63 ++ ++ NaN
64 + NaN NaN
65 + +++ +++
66 + +++ ++
67 ++ + NaN
68 ++ ++ ++
69 + NaN NaN
70 +++ + +++
71 + ++ NaN
72 ++ +++ +++
73 + + +
74 + NaN NaN
75 +++ +++ +++
76 + ++ NaN
77 + ++ NaN
78 +++ +++ NaN
79 + + +
80 ++ NaN ++
81 ++ NaN NaN
82 ++ +++ NaN
83 + NaN ++
84 ++ + +
85 + NaN NaN
86 ++ NaN NaN
87 ++ NaN NaN

87 rows × 3 columns


In [125]:
cool_dict = {}
cool_dict["+"] = 1
cool_dict["++"] = 2
cool_dict["+++"] = 3
cool_dict


Out[125]:
{'+': 1, '++': 2, '+++': 3}

In [126]:
# def cool_series_convert(series_in):
#      return [cool_dict[x] if x in cool_dict.keys() else np.nan for x in series_in.values]

In [127]:
def get_plus(series_in):
    return [len(''.join([x for x in val if x == '+'])) if val is not np.nan else np.nan for val in series_in.values ]

In [128]:
pure_cool_df.iloc[1]


Out[128]:
Coolness, attractiveness     ++
Coolness                     ++
Coolness [+, ++, +++]       NaN
Name: 2, dtype: object

In [129]:
get_plus(pure_cool_df.iloc[1])


Out[129]:
[2, 2, nan]

In [130]:
cool_df = pure_cool_df.apply(lambda x: get_plus(x), axis=1)
cool_df.head()


Out[130]:
Coolness, attractiveness Coolness Coolness [+, ++, +++]
1 1.0 1.0 NaN
2 2.0 2.0 NaN
3 1.0 2.0 3.0
4 3.0 3.0 1.0
5 3.0 3.0 NaN

In [131]:
pure_grade_df = grade_df[["Rating", "Grade", "Rating: [a, b, c, d]"]].apply(lambda x: x.str.lower(), axis=1)
pure_grade_df.head()


Out[131]:
Rating Grade Rating: [a, b, c, d]
1 d d NaN
2 c b NaN
3 c b c
4 a a b
5 c b NaN

In [132]:
scoring_dict = {}
scoring_dict["a"] = 4
scoring_dict["b"] = 3
scoring_dict["c"] = 2
scoring_dict["d"] = 1
scoring_dict


Out[132]:
{'a': 4, 'b': 3, 'c': 2, 'd': 1}

In [133]:
# def series_convert(series_in):
#     list_out = []
#     for x in series_in:
#         if x is not np.nan:
#             list_out.append(scoring_dict[y])
#         else:
#             list_out.append(0)
        
#     return list_out

def series_convert(series_in):
     return [scoring_dict[x] if x in scoring_dict.keys() else np.nan for x in series_in.values]

In [134]:
pure_grade_df.iloc[0]


Out[134]:
Rating                    d
Grade                     d
Rating: [a, b, c, d]    NaN
Name: 1, dtype: object

In [135]:
pure_grade_numerical_df = pure_grade_df.apply(lambda x: series_convert(x), axis=1)

In [136]:
grade_column = pure_grade_numerical_df.mean(axis=1)
print(grade_column[:5])
coolness_column = cool_df.mean(axis=1)


1    1.000000
2    2.500000
3    2.333333
4    3.666667
5    2.500000
dtype: float64

In [137]:
score_column = (grade_column + coolness_column)/2
score_column


Out[137]:
1     1.000000
2     2.250000
3     2.166667
4     3.000000
5     2.750000
6     1.000000
7     2.083333
8     2.500000
9     2.500000
10    2.666667
11    2.500000
12    3.000000
13    3.000000
14    1.666667
15    2.750000
16    1.000000
17    1.833333
18    2.500000
19    3.000000
20    1.250000
21    2.166667
22    3.166667
23    2.000000
24    2.500000
25    2.750000
26    2.250000
27    3.250000
28    2.500000
29    2.250000
30    2.333333
        ...   
58    2.500000
59    2.500000
60    2.500000
61    1.500000
62    2.500000
63    2.750000
64    2.000000
65    2.500000
66    2.166667
67    2.000000
68    2.750000
69    1.000000
70    2.166667
71    2.250000
72    2.833333
73    2.000000
74    1.500000
75    3.000000
76    2.500000
77    2.000000
78    3.000000
79    2.166667
80    2.000000
81    1.500000
82    2.500000
83    1.750000
84    2.000000
85    1.000000
86    3.000000
87    2.500000
Length: 87, dtype: float64

In [138]:
submissions_df.columns


Out[138]:
Index(['ID', 'Rating', 'Category', 'Level', 'Coolness, attractiveness',
       'pycon', 'long slot', 'ID.1', 'Grade', 'Category.1', 'Level.1',
       'Data Track Y/N', 'Coolness', 'Suggest to Pycon', 'Long Slot',
       'Talk/Workshop', 'ID.2', 'Type', 'Rating: [a, b, c, d]', 'cat', 'level',
       'Coolness [+, ++, +++]', 'Attractiveness: [+, ++, +++]',
       'Suggest to Pycon: [yes, no]', 'Long slot: [yes, no]'],
      dtype='object')

In [180]:
def remove_nan(list_in):
    list_listin = list(list_in)
    for item in list_listin:
        if item is np.nan:
            list_listin.remove(item)
            
    return list_listin

In [181]:
remove_nan(['no', np.nan,  np.nan])


Out[181]:
['no', nan]

In [218]:
mode = lambda x: x.str.lower().mode()[0] if len(x) > 2 else str(x.values)

In [219]:
category_column = submissions_df[['Category', 'Category.1', 'cat']].apply(lambda x:str(x.values), axis=1)

level_column = submissions_df[['Level', 'Level.1', 'level']].apply(mode, axis=1)

pycon_column = submissions_df[['pycon', 'Suggest to Pycon', 'Suggest to Pycon: [yes, no]']]\
.apply(mode, axis=1)

long_slot_column = submissions_df[['long slot', 'Long Slot', 'Long slot: [yes, no]']]\
.apply(mode, axis=1)

print(long_slot_column[:5])
print(pycon_column[:5])
print(level_column[:5])


1    no
2    no
3    no
4    no
5    no
dtype: object
1    yes
2     no
3     no
4     no
5    yes
dtype: object
1        beginner
2             all
3    intermediate
4        beginner
5        beginner
dtype: object

In [217]:
pd.Series(['beginner', 'Beginner', np.nan]).str.lower().mode()


Out[217]:
0    beginner
dtype: object

In [220]:
final_df = pd.concat([submissions_df[["ID"]], category_column, \
                      level_column, score_column, grade_column, \
                      coolness_column, pycon_column, long_slot_column], axis=1)

In [221]:
final_df.columns=["ID", "category", "level", "score", "grade", "coolness", "pycon", "long-slot"]
final_df.head()


Out[221]:
ID category level score grade coolness pycon long-slot
1 0B9B ['-' 'general' nan] beginner 1.000000 1.000000 1.000000 yes no
2 0B9D ['general' 'general' nan] all 2.250000 2.500000 2.000000 no no
3 0B9F ['visualisation' 'Visualisation/Deployment' 'v... intermediate 2.166667 2.333333 2.000000 no no
4 0BA2 ['general' 'process/pipelines' 'deep learning'] beginner 3.000000 3.666667 2.333333 no no
5 0BA5 ['general' 'visualisation' nan] beginner 2.750000 2.500000 3.000000 yes no

In [222]:
top50_df = final_df.sort_values(by="score", ascending=False).head(50)
top50_df


Out[222]:
ID category level score grade coolness pycon long-slot
27 3048 ['visual' 'process/pipelines' nan] intermediate 3.250000 4.000000 2.500000 no yes
35 54C9 ['general' 'deep learning' 'deep learning'] all 3.166667 3.333333 3.000000 no no
22 303D ['deep learning' 'Machine Learning' 'deep lear... intermediate 3.166667 3.333333 3.000000 no no
19 3038 ['general' 'Bioinformatics' nan] beginner/intermediate 3.000000 3.000000 3.000000 no no
86 E723 ['general' nan nan] beginner 3.000000 4.000000 2.000000 no no
4 0BA2 ['general' 'process/pipelines' 'deep learning'] beginner 3.000000 3.666667 2.333333 no no
78 E70C ['general' 'process/pipelines' nan] beginner 3.000000 3.000000 3.000000 no no
75 E707 ['deep learning' 'deep learning' 'deep learning'] intermediate 3.000000 3.000000 3.000000 no no
12 0BB4 ['deep learning' 'deep learning' 'deep learning'] beginner 3.000000 3.333333 2.666667 no yes
13 0BB6 ['general' 'AI' nan] beginner 3.000000 3.500000 2.500000 yes no
39 54DB ['general' 'AI' 'process'] intermediate 2.833333 3.333333 2.333333 no no
72 C28E ['general' 'Machine Learning' 'deep learning'] intermediate 2.833333 3.000000 2.666667 no no
25 3045 ['NLP' 'NLP' nan] advanced 2.750000 3.500000 2.000000 no no
53 9DE8 ['process/pipelines' 'process/pipelines' nan] advanced 2.750000 3.500000 2.000000 no no
44 7959 ['NLP' 'NLP' nan] intermediate 2.750000 3.500000 2.000000 no no
63 C279 ['process/pipelines' 'general' nan] advanced 2.750000 3.500000 2.000000 no no
68 C27F ['deep learning' 'deep learning' 'deep learning'] intermediate 2.750000 3.500000 2.000000 no no
5 0BA5 ['general' 'visualisation' nan] beginner 2.750000 2.500000 3.000000 yes no
15 302F ['process/pipelines' 'process/pipelines' nan] intermediate 2.750000 3.500000 2.000000 yes no
41 7951 ['deep learning' 'deep learning' 'deep learning'] beginner 2.666667 3.333333 2.000000 no no
51 9DE3 ['general' 'Survival Analysis' 'deep learning'] intermediate 2.666667 3.333333 2.000000 no no
10 0BAF ['deep learning' 'Machine Learning' 'deep lear... advanced 2.666667 3.000000 2.333333 no no
58 9DF4 ['process/pipelines' 'process/pipelines' nan] beginner 2.500000 3.000000 2.000000 no no
33 54C4 ['process/pipelines' 'process/pipelines' nan] advanced 2.500000 3.500000 1.500000 no no
76 E70A ['process/pipelines' 'process/pipelines' nan] intermediate 2.500000 3.500000 1.500000 yes no
52 9DE6 ['general' 'general' 'general'] intermediate 2.500000 3.000000 2.000000 no no
82 E717 ['deep learning' 'deep learning' nan] beginner 2.500000 2.500000 2.500000 no no
59 9DF7 ['NLP' 'NLP' 'NLP'] beginner 2.500000 3.333333 1.666667 no no
60 9DF9 ['deep learning' 'Machine Learning' 'deep lear... advanced 2.500000 2.333333 2.666667 no no
62 9E00 ['visual' 'process/pipelines' nan] beginner 2.500000 3.000000 2.000000 no no
36 54CE ['process/pipelines' 'Machine Learning' 'gener... intermediate 2.500000 3.333333 1.666667 no no
65 C27B ['process/pipelines' 'general' 'visualisation'] intermediate 2.500000 2.666667 2.333333 no no
87 E725 ['general' nan nan] beginner 2.500000 3.000000 2.000000 y no
9 0BAB ['process/pipelines' 'process/pipelines' nan] beginner 2.500000 3.000000 2.000000 no no
8 0BAA ['NLP' 'NLP' 'NLP'] intermediate 2.500000 2.666667 2.333333 no no
31 54C2 ['NLP' 'general' nan] beginner 2.500000 2.500000 2.500000 no no
11 0BB0 ['general' nan nan] beginner 2.500000 3.000000 2.000000 no no
24 3041 ['-' 'general' nan] all 2.500000 3.000000 2.000000 yes no
28 54BE ['general' nan nan] beginner 2.500000 3.000000 2.000000 y no
18 3035 ['process/pipelines' 'process/pipelines' nan] beginner? 2.500000 2.500000 2.500000 no no
56 9DF2 ['NLP' 'NLP' 'NLP'] beginner 2.333333 3.000000 1.666667 no no
40 7950 ['process/pipelines' 'general' 'general'] beginner 2.333333 3.000000 1.666667 no no
30 54C1 ['process/pipelines' 'process/pipelines' 'gene... intermediate 2.333333 3.000000 1.666667 no no
71 C287 ['general' 'general' nan] beginner 2.250000 3.000000 1.500000 yes no
29 54C0 ['process/pipelines' nan 'process'] expert 2.250000 2.500000 2.000000 no no
26 3046 ['process/pipelines' 'process/pipelines' nan] intermediate 2.250000 2.500000 2.000000 no no
2 0B9D ['general' 'general' nan] all 2.250000 2.500000 2.000000 no no
70 C285 ['process/pipelines' 'Machine Learning' 'gener... intermediate 2.166667 2.000000 2.333333 no yes
66 C27D ['deep learning' 'Machine Learning' 'deep lear... beginner 2.166667 2.333333 2.000000 no no
46 7961 ['NLP' 'NLP' 'visualisation'] advanced 2.166667 2.333333 2.000000 no no

In [116]:
top30_df.to_csv("../data/top30entries.csv")

In [107]:
top21_df.level.value_counts()


Out[107]:
['intermediate' 'Intermediate' nan]                        3
['intermediate' 'Intermediate' 'intermediate']             2
['intermediate' 'Beginner/Intermediate' 'intermediate']    2
['intermediate' 'Advanced' nan]                            2
['intermediate' 'Beginner/Intermediate' nan]               1
['intermediate' 'Beginner' 'intermediate']                 1
['beginner' 'Beginner/Intermediate' nan]                   1
['intermediate' 'Beginner/Intermediate' 'beginner']        1
['beginner' 'Intermediate' nan]                            1
['intermediate' 'Intermediate/Advanced' 'intermediate']    1
['beginner' 'Advanced' nan]                                1
['beginner' 'Beginner/Intermediate' 'intermediate']        1
['beginner' nan nan]                                       1
['advanced' 'Beginner' 'beginner']                         1
['beginner' 'Intermediate/Advanced' nan]                   1
['intermediate' 'All' nan]                                 1
Name: level, dtype: int64

In [ ]: