In [ ]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline
Privacy restriction:
The original (personal) cleaned DataFrame is not included in the repo. To reproduce the pickled DataFrame of attended events ("events_df.pkl"), run the notebook "0_Cleaning" with your own data.
For further steps: the repo contains a pickled DataFrame for modeling (notebook "3_Modeling") from which private information has been eliminated.
In [ ]:
# Load the pickled events DataFrame produced by the cleaning notebook.
# NOTE: unpickling executes arbitrary code, so only load files you created yourself.
file_path = "../data/events_df.pkl"
df = pd.read_pickle(file_path)

# Shape first, then the per-column dtypes, each on its own output block.
print(df.shape, df.dtypes, sep="\n")
df.head(5)
In [ ]:
# Summary statistics from describe(), followed by a blank separator line.
print("Stats (continuous Vars):", df.describe(), "", sep="\n")
# Number of missing values per column.
print("NaN values count:", df.isnull().sum(), sep="\n")
In [ ]:
# Print the frequency table of every column, each followed by one blank line.
for col in df.columns:
    print(df[col].value_counts(), end="\n\n")
In [ ]:
# Mean distance and rating per main topic.
# The two columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns, which raises
# a TypeError in pandas >= 2.0 (numeric_only now defaults to False).
df.groupby("main_topic")[["distance", "rating"]].mean()
In [ ]:
# Mean distance and rating per city.
# The two columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns, which raises
# a TypeError in pandas >= 2.0 (numeric_only now defaults to False).
df.groupby("city")[["distance", "rating"]].mean()
Missing Values
In [ ]:
# Replace every NaN with the placeholder string "missing".
# NOTE(review): fillna is applied to all columns; the original comment says
# NaN only occurs in the string-valued columns -- confirm against the data.
df_cleaned = df.fillna(value="missing")

# Sanity check: per-column count of remaining missing values.
print(df_cleaned.isna().sum())
DFs for Modeling
In [ ]:
# Minimal Features Model: the columns kept for the first model DataFrame.
model01_cols = [
    u"main_topic",
    u"buzzwordy_title",
    u"buzzwordy_organizer",
    u"days",
    u"weekday",
    u"city",
    u"country",
    u"distance",
    u"ticket_prize",
    u"rating",
]

# Keep only those columns (in the listed order) and preview the result.
df_model01 = df_cleaned.loc[:, model01_cols]
df_model01.head(5)
In [ ]:
# One-hot encode the categorical features of the minimal model.
# The columns to encode are named explicitly, so the prefixes no longer rely
# on dtype inference picking exactly these four columns in exactly this order.
# The frame is rebuilt from df_cleaned[model01_cols] instead of overwriting its
# own input, so the cell can be re-run without failing on already-encoded data.
model01_categorical_cols = ["main_topic", "weekday", "city", "country"]
df_model01 = pd.get_dummies(
    df_cleaned[model01_cols],
    columns=model01_categorical_cols,
    prefix=model01_categorical_cols,
)
In [ ]:
def pickle_model(df_model, file_path):
    """
    Pickles provided model DF for modeling step.

    Parameters
    ----------
    df_model : pandas.DataFrame
        The model DataFrame to persist.
    file_path : str or path-like
        Destination of the pickle file. A missing parent directory is
        created before writing.

    Returns
    -------
    None
    """
    import os

    # Determine the target directory; anything that is not path-like
    # (e.g. an open file object) is handed to pandas untouched.
    try:
        parent_dir = os.path.dirname(file_path)
    except (TypeError, AttributeError):
        parent_dir = ""

    # to_pickle does not create directories, so make sure the target exists.
    if parent_dir and not os.path.isdir(parent_dir):
        os.makedirs(parent_dir)

    df_model.to_pickle(file_path)
# Model01: persist the encoded frame for the modeling notebook.
pickle_model(df_model=df_model01, file_path="../data/df_model01.pkl")
In [ ]: