notebook.community

Edit and run



In [1]:

    
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import math							# mathematical functions
import pandas as pd					# dataframes



In [2]:

    
# loads the pickled raw profile data from disk
# NOTE: pickle.load executes arbitrary code from the file; only use on trusted data
with open('../raw_data/data_profile', 'rb') as raw_file:
    data_profile = pickle.load(raw_file)



In [3]:

    
# wraps the raw data in a dataframe and labels its six columns
profile_columns = ['id', 'desc', 'country', 'join_date', 'profile', 'tabs']
df = pd.DataFrame(data_profile)
df.columns = profile_columns



In [4]:

    
# expands each row's 'tabs' entry into separate columns; missing entries
# are filled with '0' before every column is converted to numeric
tabs = df['tabs'].apply(pd.Series).fillna('0').apply(pd.to_numeric)
# drops the first character of every expanded column name
tabs = tabs.rename(columns=lambda label: label[1:])
# attaches the expanded columns and removes the original 'tabs' column
df = df.join(tabs)
df.drop(columns='tabs', inplace=True)



In [5]:

    
# derives a status label from the description, then drops the description
# every row starts as 'inactive'; 'author' is applied last so it wins over
# 'reader' when a description contains both
df['status'] = 'inactive'
for label in ('reader', 'author'):
    matches = [label in text for text in df['desc']]
    df.loc[matches, 'status'] = label
df.drop(columns='desc', inplace=True)



In [6]:

    
# parses date column
def _parse_join_date(raw):
    """Split a raw join date into its parts and widen a two-digit year.

    The raw string is split on runs of '-' or '/'. A value that is exactly
    'NA' is returned as the string 'NA'. Otherwise the list of parts is
    returned; when the third part is two characters long it is treated as
    a year, where 91-99 become 1991-1999 and every other value becomes 20xx.
    """
    parts = re.split(r'[-/]+', raw)
    if parts == ['NA']:
        return 'NA'
    # values with fewer than three parts have no year field; they are left
    # unchanged instead of raising IndexError
    if len(parts) > 2 and len(parts[2]) == 2:
        year = int(parts[2]) + 2000
        if year > 2090:
            # step back one century (previously subtracted 1000, which
            # turned e.g. '98' into 1098 instead of 1998)
            year -= 100
        parts[2] = str(year)
    return parts


df['join'] = [_parse_join_date(raw) for raw in df['join_date']]
del df['join_date']



In [7]:

    
# collapses each row's profile items into one space-separated string
df['profile'] = list(map(' '.join, df['profile']))



In [8]:

    
# writes the cleaned dataframe to disk as a pickle
pd.to_pickle(df, "../clean_data/df_profile")