In [117]:
import pandas as pd
import numpy as np
In [118]:
# Load the collated reviewer spreadsheet exported as CSV.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere until it is pointed at a shared data location.
submissions_csv_path = "/Users/frankkelly/Downloads/collated - Tabellenblatt1.csv"
submissions_df = pd.read_csv(submissions_csv_path)
submissions_df.head()
Out[118]:
In [119]:
# Keep the rows from index label 1 onward, i.e. drop the row labelled 0
# (presumably not a real submission -- TODO confirm against the CSV).
submissions_df = submissions_df.loc[1:]
In [120]:
# Preview the first rows after the row labelled 0 has been dropped.
submissions_df.head()
Out[120]:
In [121]:
# List the column names; the cells below select reviewer columns by these labels.
submissions_df.columns
Out[121]:
In [122]:
# Submission ID plus the three letter-grade columns.
grade_df = submissions_df.loc[
    :, ["ID", "Rating", "Grade", "Rating: [a, b, c, d]"]
]
grade_df.head()
Out[122]:
In [123]:
# The three "coolness" columns, rated with '+' marks.
pure_cool_df = submissions_df.loc[
    :, ["Coolness, attractiveness", "Coolness", "Coolness [+, ++, +++]"]
]
pure_cool_df.head()
Out[123]:
In [124]:
# Display the whole coolness table to eyeball the raw '+' ratings.
pure_cool_df
Out[124]:
In [125]:
# Lookup from a '+' rating string to its numeric value.
cool_dict = {"+": 1, "++": 2, "+++": 3}
cool_dict
Out[125]:
In [126]:
# def cool_series_convert(series_in):
# return [cool_dict[x] if x in cool_dict.keys() else np.nan for x in series_in.values]
In [127]:
def get_plus(series_in):
    """Count the '+' characters in each entry of a pandas Series.

    Parameters
    ----------
    series_in : pandas.Series
        Ratings such as "+", "++" or "+++"; may contain missing values.

    Returns
    -------
    list
        One element per input value: the number of '+' characters when the
        value is a string, ``np.nan`` for anything else (NaN, None, numbers).
    """
    # Check for "is a string" instead of ``val is not np.nan``: a missing
    # value is not guaranteed to be the np.nan singleton, and None or numeric
    # cells would raise TypeError when iterated character by character.
    return [
        val.count('+') if isinstance(val, str) else np.nan
        for val in series_in.values
    ]
In [128]:
# Inspect the row at position 1 as a sample input for get_plus.
pure_cool_df.iloc[1]
Out[128]:
In [129]:
# Try get_plus on that single row to check the '+' counts.
get_plus(pure_cool_df.iloc[1])
Out[129]:
In [130]:
# Convert every '+' rating to its numeric count, one reviewer column at a time.
# get_plus returns a plain list; wrapping it in an index-aligned Series makes
# apply() return a DataFrame on every pandas version. A row-wise apply whose
# function returns a list yields a Series of lists on pandas >= 0.23, which
# would break the later cool_df.mean(axis=1).
cool_df = pure_cool_df.apply(
    lambda col: pd.Series(get_plus(col), index=col.index)
)
cool_df.head()
Out[130]:
In [131]:
# Lower-case the letter grades so that "A" and "a" are treated alike.
pure_grade_df = (
    grade_df[["Rating", "Grade", "Rating: [a, b, c, d]"]]
    .apply(lambda row: row.str.lower(), axis=1)
)
pure_grade_df.head()
Out[131]:
In [132]:
# Numeric score for each letter grade ("a" is the best, "d" the worst).
scoring_dict = {"a": 4, "b": 3, "c": 2, "d": 1}
scoring_dict
Out[132]:
In [133]:
def series_convert(series_in):
    """Translate letter grades into their numeric scores.

    Parameters
    ----------
    series_in : pandas.Series
        Letter grades; values are looked up as keys of ``scoring_dict``.

    Returns
    -------
    list
        The ``scoring_dict`` score for each value, or ``np.nan`` when the
        value is not a key of ``scoring_dict`` (e.g. a missing grade).
    """
    # Unknown or missing grades become NaN rather than 0.
    return [scoring_dict.get(grade, np.nan) for grade in series_in.values]
In [134]:
# Inspect the first row of the lower-cased grades.
pure_grade_df.iloc[0]
Out[134]:
In [135]:
# Map every letter grade to its numeric score, one reviewer column at a time.
# series_convert returns a plain list; wrapping it in an index-aligned Series
# makes apply() return a DataFrame on every pandas version. A row-wise apply
# whose function returns a list yields a Series of lists on pandas >= 0.23,
# which would break the later pure_grade_numerical_df.mean(axis=1).
pure_grade_numerical_df = pure_grade_df.apply(
    lambda col: pd.Series(series_convert(col), index=col.index)
)
In [136]:
# Average across the reviewer columns of each submission; mean() skips NaN by
# default, so reviewers who left a field blank do not pull the average down.
grade_column = pure_grade_numerical_df.mean(axis="columns")
print(grade_column[:5])
coolness_column = cool_df.mean(axis="columns")
In [137]:
# Overall score: the plain average of the mean grade and the mean coolness.
# A submission missing either component ends up with a NaN score.
score_column = grade_column.add(coolness_column).div(2)
score_column
Out[137]:
In [138]:
# Re-list the column names before picking the category / level / yes-no columns.
submissions_df.columns
Out[138]:
In [180]:
def remove_nan(list_in):
    """Return a new list holding the items of ``list_in`` that are not NaN.

    Parameters
    ----------
    list_in : iterable
        Values to filter; the input itself is left unmodified.

    Returns
    -------
    list
        The non-NaN items in their original order.
    """
    # Build a fresh list instead of calling .remove() while iterating: removing
    # during iteration skips the element that follows each removal, so
    # consecutive NaNs were only partly removed. The isnan test also catches
    # NaN floats that are not the np.nan singleton.
    return [
        item for item in list_in
        if not (isinstance(item, (float, np.floating)) and np.isnan(item))
    ]
In [181]:
# Quick check of remove_nan on a list that contains repeated NaNs.
remove_nan(['no', np.nan, np.nan])
Out[181]:
In [218]:
def mode(x):
    """Return the most common lower-cased value of a row of reviewer answers.

    Parameters
    ----------
    x : pandas.Series
        String answers (possibly with missing values) for one submission.

    Returns
    -------
    str or float
        For Series with more than two entries: the first mode of the
        lower-cased values, or ``np.nan`` when every entry is missing.
        For shorter Series: the string representation of ``x.values``.
    """
    if len(x) <= 2:
        return str(x.values)
    modes = x.str.lower().mode()
    # mode() drops missing values, so a row with no answers yields an empty
    # result; indexing it with [0] would raise instead of giving a missing value.
    return modes.iloc[0] if len(modes) else np.nan
In [219]:
# Categories are free text, so keep all three reviewer answers as one string.
category_column = (
    submissions_df[['Category', 'Category.1', 'cat']]
    .apply(lambda row: str(row.values), axis=1)
)
# For level and the yes/no questions, take the majority answer per submission.
level_column = (
    submissions_df[['Level', 'Level.1', 'level']]
    .apply(mode, axis=1)
)
pycon_column = (
    submissions_df[['pycon', 'Suggest to Pycon', 'Suggest to Pycon: [yes, no]']]
    .apply(mode, axis=1)
)
long_slot_column = (
    submissions_df[['long slot', 'Long Slot', 'Long slot: [yes, no]']]
    .apply(mode, axis=1)
)
print(long_slot_column[:5])
print(pycon_column[:5])
print(level_column[:5])
In [217]:
# Scratch check: how lower-casing followed by mode() treats mixed case and NaN.
pd.Series(['beginner', 'Beginner', np.nan]).str.lower().mode()
Out[217]:
In [220]:
# Assemble the per-submission summary; the pieces are aligned on the row index.
final_df = pd.concat(
    [
        submissions_df[["ID"]],
        category_column,
        level_column,
        score_column,
        grade_column,
        coolness_column,
        pycon_column,
        long_slot_column,
    ],
    axis=1,
)
In [221]:
# Give the concatenated pieces readable names, in the order they were concatenated.
final_df.columns = [
    "ID",
    "category",
    "level",
    "score",
    "grade",
    "coolness",
    "pycon",
    "long-slot",
]
final_df.head()
Out[221]:
In [222]:
# Rank submissions from highest to lowest score and keep the first 50.
top50_df = final_df.sort_values("score", ascending=False).iloc[:50]
top50_df
Out[222]:
In [116]:
# top30_df is not created by any other cell, so this cell raised NameError on
# a fresh kernel. Build it here the same way as the top-50 table.
# NOTE(review): the 30-row cut is inferred from the variable and file names -- confirm.
top30_df = final_df.sort_values(by="score", ascending=False).head(30)
top30_df.to_csv("../data/top30entries.csv")
In [107]:
# top21_df is not created by any other cell, so this cell raised NameError on
# a fresh kernel. Build it here the same way as the top-50 table.
# NOTE(review): the 21-row cut is inferred from the variable name -- confirm.
top21_df = final_df.sort_values(by="score", ascending=False).head(21)
# How many of the selected talks fall into each level.
top21_df.level.value_counts()
Out[107]:
In [ ]: