In [1]:
# imports libraries
import pickle # import/export lists
import re # regular expression
import math # mathematical functions
import pandas as pd # dataframes
In [2]:
# opens raw data
with open ('data_profile', 'rb') as fp:
data_profile = pickle.load(fp)
In [3]:
# converts to dataframe
df = pd.DataFrame(data_profile)
df.columns = ['id', 'desc', 'country', 'join_date', 'tabs']
In [4]:
# splits tabs into individual columns
tabs = df['tabs'].apply(pd.Series).fillna('0')
tabs = tabs.apply(pd.to_numeric)
tabs.columns = [name[1:] for name in tabs.columns]
df = df.join(tabs)
del df['tabs']
In [5]:
# parses description column
df['status'] = 'inactive'
df.loc[['reader' in row for row in df['desc']], 'status'] = 'reader'
df.loc[['author' in row for row in df['desc']], 'status'] = 'author'
del df['desc']
In [6]:
# parses date column
df['join'] = [re.split(r'[-/]+', row) for row in df['join_date']]
valid = [row[0] != 'NA' for row in df['join']]
df['join_month'] = 'NA'
df.loc[valid, 'join_month'] = [row[0] for row in df['join'][valid]]
df['join_year'] = 'NA'
df.loc[valid, 'join_year'] = [row[2] for row in df['join'][valid]]
old_time = [len(row) == 4 for row in df.join_year]
df.loc[old_time, 'join_year'] = [row[2:] for row in df.loc[old_time, 'join_year']]
del df['join']
In [7]:
# saves dataframe
df.to_pickle("df_profile")