In [1]:
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import math							# mathematical functions
import pandas as pd					# dataframes

In [2]:
# loads the pickled raw profile data from disk
raw_path = '../raw_data/data_profile'
with open(raw_path, 'rb') as fp:
    data_profile = pickle.load(fp)

In [3]:
# wraps the raw records in a dataframe and labels its six columns
column_names = ['id', 'desc', 'country', 'join_date', 'profile', 'tabs']
df = pd.DataFrame(data_profile)
df.columns = column_names

In [4]:
# expands each 'tabs' entry into its own columns (one pd.Series per row);
# gaps are filled with '0' so every column can be converted to numeric
tabs = (
    df['tabs']
    .apply(pd.Series)
    .fillna('0')
    .apply(pd.to_numeric)
)

# drops the first character of every expanded column name
tabs.columns = [label[1:] for label in tabs.columns]

# attaches the expanded columns and discards the original 'tabs' column
df = df.join(tabs).drop(columns='tabs')

In [5]:
# derives a 'status' label from the description, then removes 'desc'
df['status'] = 'inactive'

# labels are applied in this order, so 'author' overrides 'reader'
# whenever a description contains both
for label in ('reader', 'author'):
    has_label = [label in text for text in df['desc']]
    df.loc[has_label, 'status'] = label

del df['desc']

In [6]:
# parses date column
def _expand_join_date(raw):
    """Split a raw join date into its parts and expand a two-digit year.

    Parameters
    ----------
    raw : str
        Date text whose parts are separated by runs of '-' or '/',
        or the literal placeholder 'NA'.

    Returns
    -------
    list of str or str
        The string 'NA' when `raw` is exactly the placeholder; otherwise
        the list of parts. When the third part has exactly two characters
        it is treated as a two-digit year and replaced by a four-digit one.
    """
    parts = re.split(r'[-/]+', raw)

    # the missing-date placeholder is kept as a plain string, not a list
    if parts == ['NA']:
        return 'NA'

    # dates with fewer than three parts are passed through unchanged
    # instead of raising IndexError on parts[2]
    if len(parts) >= 3 and len(parts[2]) == 2:
        year = int(parts[2]) + 2000
        # two-digit years above 90 belong to the previous century, so shift
        # back by 100 years (the earlier 1000-year shift turned '98' into
        # 1098 rather than 1998)
        if year > 2090:
            year -= 100
        parts[2] = str(year)

    return parts


df['join'] = [_expand_join_date(raw) for raw in df['join_date']]

del df['join_date']

In [7]:
# collapses each profile entry (an iterable of strings) into a single
# space-separated string
joined_profiles = list(map(' '.join, df['profile']))
df['profile'] = joined_profiles

In [8]:
# writes the cleaned dataframe to disk as a pickle
clean_path = "../clean_data/df_profile"
df.to_pickle(clean_path)