In [ ]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
Privacy restriction:
The original (personal) dataset is not included in the repo. Provide your own self-collected features and ratings for attended events in "../data/events.csv" (schema info in REFS.md).
In [ ]:
# Read the event records from CSV and take a first look at them:
# dimensions, inferred column types and the leading rows.
data_f = "../data/events.csv"
raw_data = pd.read_csv(filepath_or_buffer=data_f)
print(raw_data.shape, raw_data.dtypes, sep="\n")
raw_data.head()
In [ ]:
# Convert the raw CSV columns into proper date/time/integer types.
raw_data["date"] = pd.to_datetime(raw_data["date"], format="%d.%m.%Y")
raw_data["start_time"] = pd.to_datetime(raw_data["start_time"], format="%H:%M").dt.time
# NOTE(review): duration is parsed as a clock time ("%H:%M"), so a value of
# 24:00 or more would be rejected -- confirm the data never exceeds that.
raw_data["duration"] = pd.to_datetime(raw_data["duration"], format="%H:%M").dt.time
# ``Series.dt.weekday_name`` was removed in pandas 1.0; ``day_name()`` is its
# replacement and yields the weekday names ("Monday", ...).
raw_data["weekday"] = raw_data["date"].dt.day_name()
# Raw string: "\€" is an invalid escape sequence in a normal string literal.
# The pattern removes the euro sign and commas before the integer cast.
raw_data["ticket_prize"] = raw_data["ticket_prize"].replace(r"[€,]", "", regex=True).astype(int)
Buzzwords
In [ ]:
# Generate Buzzword List
buzzword_list = []


def parse_buzzword_file(file_path, buzzword_list):
    """
    Parse a text file with one buzzword per line and append the words to
    ``buzzword_list``.

    Every line is stripped of surrounding whitespace (including ``\\r`` from
    Windows line endings) and lower-cased. Blank lines are skipped: the list
    is later joined with ``|`` into a regular expression in this notebook,
    and an empty entry would become an empty alternative that matches every
    string.

    Args:
        file_path: Path of the text file to read.
        buzzword_list: List that the parsed words are appended to (mutated
            in place).

    Returns:
        The same list object that was passed in as ``buzzword_list``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so non-ASCII words are read identically on every
    # platform -- assumes the word files are stored as UTF-8.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            buzzword = line.strip().lower()
            if buzzword:
                buzzword_list.append(buzzword)
    return buzzword_list
# Feed all three word files into the shared ``buzzword_list``. The parser
# returns the very list it was handed, so the three names below all refer to
# that one accumulated list rather than to separate per-file lists.
buzzwords_wiki, buzzwords_de, buzzwords_personal = (
    parse_buzzword_file(source_path, buzzword_list)
    for source_path in (
        "../data/buzzwords_wiki.txt",
        "../data/buzzwords_de.txt",
        "../data/buzzwords_personal.txt",
    )
)
print("Buzzword Count:", len(buzzword_list))
In [ ]:
# Serialize the collected buzzwords to disk so the app can load them.
file_path = "../data/buzzword_list.pkl"
with open(file_path, mode="wb") as f:
    pickle.dump(buzzword_list, f)
In [ ]:
# Create binary columns (buzzwordy_title, buzzwordy_organizer)
# Build the alternation pattern once. Each buzzword is escaped so characters
# such as "+", "." or "(" are matched literally instead of being interpreted
# as regex syntax, and empty entries are dropped because an empty alternative
# matches every string. If no usable buzzword remains, fall back to "(?!)",
# a pattern that never matches, so nothing gets flagged.
buzzword_pattern = "|".join(re.escape(word) for word in buzzword_list if word) or r"(?!)"
# na=False: a missing title/organizer counts as "no buzzword". Without it the
# mask can contain NaN, which np.where treats as truthy and flags as 1.
buzzword_mask_title = raw_data.title.str.contains(buzzword_pattern, case=False, na=False)
buzzword_mask_organizer = raw_data.organizer.str.contains(buzzword_pattern, case=False, na=False)
raw_data["buzzwordy_title"] = np.where(buzzword_mask_title, 1, 0)
raw_data["buzzwordy_organizer"] = np.where(buzzword_mask_organizer, 1, 0)
Checking
In [ ]:
# Sanity check: show dimensions, column types and the first rows of the
# DataFrame as it stands at this point of the notebook.
print(raw_data.shape, raw_data.dtypes, sep="\n")
raw_data.head()
Output
In [ ]:
# Write the DataFrame to disk as a pickle file.
file_path = "../data/events_df.pkl"
raw_data.to_pickle(path=file_path)