In [1]:
# imports libraries
import pickle # import/export lists
import re # regular expression
import math # mathematical functions
import pandas as pd # dataframes
In [2]:
# loads the pickled raw profile records
# NOTE: unpickling can execute arbitrary code, so only point this at a trusted file
with open('../raw_data/data_profile', 'rb') as raw_file:
    data_profile = pickle.load(raw_file)
In [3]:
# wraps the raw records in a dataframe and labels its six columns
column_names = ['id', 'desc', 'country', 'join_date', 'profile', 'tabs']
df = pd.DataFrame(data_profile)
df.columns = column_names
In [4]:
# expands every 'tabs' entry into its own columns (via pd.Series), treats
# missing entries as '0', and converts each resulting column to numbers
tabs = (
    df['tabs']
    .apply(pd.Series)
    .fillna('0')
    .apply(pd.to_numeric)
)
# drops the first character of every generated column label
tabs = tabs.rename(columns=lambda label: label[1:])
# attaches the expanded columns and discards the original packed column
df = df.join(tabs)
del df['tabs']
In [5]:
# derives a status label from the description:
# 'author' takes precedence over 'reader'; anything else is 'inactive'
df['status'] = [
    'author' if 'author' in text else 'reader' if 'reader' in text else 'inactive'
    for text in df['desc']
]
del df['desc']
In [6]:
# parses date column
def _normalize_join_date(raw, pivot=90):
    """Split a raw join date into its fields and expand a two-digit year.

    Parameters
    ----------
    raw : str
        Date text whose fields are separated by '-' and/or '/', or the
        placeholder 'NA'.
    pivot : int, optional
        Two-digit years greater than this value are placed in the 1900s,
        all others in the 2000s (default 90, i.e. 91-99 -> 1991-1999).

    Returns
    -------
    list of str or str
        The string 'NA' when the input is the 'NA' placeholder; otherwise
        the list of fields, with the third field rewritten as a four-digit
        year when it was given with two digits.

    Raises
    ------
    ValueError
        If a two-character third field is not an integer.
    """
    parts = re.split(r'[-/]+', raw)
    if parts == ['NA']:
        return 'NA'
    # The third field is treated as the year. Entries with fewer than three
    # fields are passed through unchanged instead of raising IndexError.
    if len(parts) > 2 and len(parts[2]) == 2:
        two_digit_year = int(parts[2])
        # Years past the pivot belong to the previous century: 99 -> 1999.
        # (The earlier code subtracted 1000 from 2099, producing 1099.)
        century = 1900 if two_digit_year > pivot else 2000
        parts[2] = str(century + two_digit_year)
    return parts


df['join'] = [_normalize_join_date(raw) for raw in df['join_date']]
del df['join_date']
In [7]:
# collapses each profile entry into a single space-separated string
df['profile'] = df['profile'].map(' '.join)
In [8]:
# writes the cleaned dataframe to disk as a pickle
clean_path = "../clean_data/df_profile"
df.to_pickle(clean_path)