notebook.community

Edit and run



In [1]:

    
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import math							# mathematical functions
import pandas as pd					# dataframes



In [2]:

    
# opens raw data
with open ('data_profile', 'rb') as fp:
    data_profile = pickle.load(fp)



In [3]:

    
# converts to dataframe
df = pd.DataFrame(data_profile)
df.columns = ['id', 'desc', 'country', 'join_date', 'tabs']



In [4]:

    
# splits tabs into individual columns
tabs = df['tabs'].apply(pd.Series).fillna('0')
tabs = tabs.apply(pd.to_numeric)
tabs.columns = [name[1:] for name in tabs.columns]
df = df.join(tabs)
del df['tabs']



In [5]:

    
# parses description column
df['status'] = 'inactive'
df.loc[['reader' in row for row in df['desc']], 'status'] = 'reader'
df.loc[['author' in row for row in df['desc']], 'status'] = 'author'
del df['desc']



In [6]:

    
# parses date column
df['join'] = [re.split(r'[-/]+', row) for row in df['join_date']]
valid = [row[0] != 'NA' for row in df['join']]

df['join_month'] = 'NA'
df.loc[valid, 'join_month'] = [row[0] for row in df['join'][valid]]

df['join_year'] = 'NA'
df.loc[valid, 'join_year'] = [row[2] for row in df['join'][valid]]
old_time = [len(row) == 4 for row in df.join_year]
df.loc[old_time, 'join_year'] = [row[2:] for row in df.loc[old_time, 'join_year']]

del df['join']



In [7]:

    
# saves dataframe
df.to_pickle("df_profile")