notebook.community

Edit and run



In [1]:

    
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import pandas as pd					# dataframes
import datetime						# dates



In [2]:

    
# loads the pickled raw story records
# NOTE(review): pickle.load can execute arbitrary code - only use on trusted files
raw_path = '../raw_data/data_story'
with open(raw_path, 'rb') as raw_file:
    data_story = pickle.load(raw_file)



In [3]:

    
# current calendar year, read from the local clock and stored as a string
cyear = '{}'.format(datetime.datetime.now().year)



In [4]:

    
# builds the story dataframe from the raw records and labels its columns
story_columns = ['storyid', 'userid', 'cat', 'title', 'summary', 'info', 'error']
df = pd.DataFrame(data_story)
df.columns = story_columns



In [5]:

    
# boolean mask: True where the userid field is not the 'NA' placeholder
isonline = df['userid'].ne('NA')



In [6]:

    
# extracts the user id: everything after the first '=' in the link
uid = re.compile("=(.*)$")
online_links = df.loc[isonline, 'userid']
df.loc[isonline, 'userid'] = [uid.search(link).group(1) for link in online_links]



In [7]:

    
# derives 'media' and 'fandom' from the category field, then drops that field
df['media'] = 'NA'
df['fandom'] = 'NA'

# length of each category list (None when the value is not a list)
cat_sizes = [len(cat) if type(cat) is list else None for cat in df['cat']]
iscontained = [size == 2 for size in cat_sizes]    # [media, fandom]
iscrossover = [size == 1 for size in cat_sizes]    # [fandom] only

contained_cats = df.loc[iscontained, 'cat']
df.loc[iscontained, 'media'] = [cat[0] for cat in contained_cats]
df.loc[iscontained, 'fandom'] = [cat[1] for cat in contained_cats]

crossover_cats = df.loc[iscrossover, 'cat']
df.loc[iscrossover, 'media'] = 'Crossover'
df.loc[iscrossover, 'fandom'] = [cat[0] for cat in crossover_cats]

del df['cat']



In [8]:

    
# breaks the info string of online stories into its ' - ' separated fields
online_info = df.loc[isonline, 'info']
df.loc[isonline, 'info'] = [text.split(' - ') for text in online_info]

# adds one placeholder column per info field, filled with 'NA'
info_columns = ['rated', 'language', 'genre', 'characters', 'chapters', 'words',
                'reviews', 'favs', 'follows', 'updated', 'published', 'status']
for info_col in info_columns:
    df[info_col] = 'NA'



In [9]:

    
# takes the rating from the first info field, then drops that field
rated_prefix = 'Rated: Fiction  '
online_info = df.loc[isonline, 'info']
df.loc[isonline, 'rated'] = [fields[0].replace(rated_prefix, '')
                             for fields in online_info]
df.loc[isonline, 'info'] = [fields[1:] for fields in online_info]



In [10]:

    
# takes the language from the first remaining info field (spaces removed),
# then drops that field
online_info = df.loc[isonline, 'info']
df.loc[isonline, 'language'] = [fields[0].replace(' ', '') for fields in online_info]
df.loc[isonline, 'info'] = [fields[1:] for fields in online_info]



In [11]:

    
# retrieves genre: online stories default to 'General'
df.loc[isonline, 'genre'] = 'General'

# the leading field is taken as a genre when it has no ': ' label
# and does not start with a space
isgenre = [type(fields) is list and ': ' not in fields[0] and fields[0][0] != ' '
           for fields in df['info']]
genre_info = df.loc[isgenre, 'info']
df.loc[isgenre, 'genre'] = [fields[0] for fields in genre_info]
df.loc[isgenre, 'info'] = [fields[1:] for fields in genre_info]

# splits genres: 'Hurt/Comfort' is rewritten first so the '/' split keeps it whole,
# then spaces are removed and the string is split on '/'
df['genre'] = [genre.replace('Hurt/Comfort', 'Hurt-Comfort').replace(' ', '').split('/')
               for genre in df['genre']]



In [12]:

    
# retrieves characters
def _starts_with_character(info):
    """Return True when the first remaining info field is a character field.

    That is the case when `info` is a non-empty list whose first item contains
    neither 'Chapters: ' nor 'Words: '. The length check keeps an exhausted
    list from raising IndexError.
    """
    return (isinstance(info, list) and len(info) > 0
            and 'Chapters: ' not in info[0] and 'Words: ' not in info[0])


ischaracter = [_starts_with_character(row) for row in df['info']]
df.loc[ischaracter, 'characters'] = [row[0] for row in df.loc[ischaracter, 'info']]
df.loc[ischaracter, 'info'] = [row[1:] for row in df.loc[ischaracter, 'info']]

# appends further character fields in case the ' - ' split broke the
# characters text into several pieces; the ' - ' separator is put back so the
# original text is restored instead of the pieces being fused together
while any(ischaracter):
    ischaracter = [_starts_with_character(row) for row in df['info']]
    found = df.loc[ischaracter, 'characters']
    extra = [row[0] for row in df.loc[ischaracter, 'info']]
    df.loc[ischaracter, 'characters'] = [m + ' - ' + n for m, n in zip(found, extra)]
    df.loc[ischaracter, 'info'] = [row[1:]
                                   for row in df.loc[ischaracter, 'info']]



In [13]:

    
# retrieves chapters
# the len() check skips stories whose info list has no fields left
ischapter = [isinstance(row, list) and len(row) > 0 and 'Chapters' in row[0]
             for row in df['info']]
chapter_info = df.loc[ischapter, 'info']
# raw string: '\D' in a plain literal is an invalid escape sequence;
# the substitution keeps only the digits of the field
df.loc[ischapter, 'chapters'] = [re.sub(r"\D", "", row[0])
                                 for row in chapter_info]
df.loc[ischapter, 'info'] = [row[1:] for row in chapter_info]



In [14]:

    
# retrieves words
# only consumes the leading field when it really is the 'Words' field, as the
# other count cells do for their labels; the len() check skips stories whose
# info list has no fields left
iswords = [isinstance(row, list) and len(row) > 0 and 'Words' in row[0]
           for row in df['info']]
words_info = df.loc[iswords, 'info']
# raw string: '\D' in a plain literal is an invalid escape sequence;
# the substitution keeps only the digits of the field
df.loc[iswords, 'words'] = [re.sub(r"\D", "", row[0])
                            for row in words_info]
df.loc[iswords, 'info'] = [row[1:] for row in words_info]



In [15]:

    
# retrieves reviews
# the len() check skips stories whose info list has no fields left
isreviews = [isinstance(row, list) and len(row) > 0 and 'Reviews' in row[0]
             for row in df['info']]
reviews_info = df.loc[isreviews, 'info']
# raw string: '\D' in a plain literal is an invalid escape sequence;
# the substitution keeps only the digits of the field
df.loc[isreviews, 'reviews'] = [re.sub(r"\D", "", row[0])
                                for row in reviews_info]
df.loc[isreviews, 'info'] = [row[1:] for row in reviews_info]



In [16]:

    
# retrieves favs
# the len() check skips stories whose info list has no fields left
isfavs = [isinstance(row, list) and len(row) > 0 and 'Favs' in row[0]
          for row in df['info']]
favs_info = df.loc[isfavs, 'info']
# raw string: '\D' in a plain literal is an invalid escape sequence;
# the substitution keeps only the digits of the field
df.loc[isfavs, 'favs'] = [re.sub(r"\D", "", row[0])
                          for row in favs_info]
df.loc[isfavs, 'info'] = [row[1:] for row in favs_info]



In [17]:

    
# retrieves follows
# the len() check skips stories whose info list has no fields left
isfollows = [isinstance(row, list) and len(row) > 0 and 'Follows' in row[0]
             for row in df['info']]
follows_info = df.loc[isfollows, 'info']
# raw string: '\D' in a plain literal is an invalid escape sequence;
# the substitution keeps only the digits of the field
df.loc[isfollows, 'follows'] = [re.sub(r"\D", "", row[0])
                                for row in follows_info]
df.loc[isfollows, 'info'] = [row[1:] for row in follows_info]



In [18]:

    
# retrieves updated: strips the 'Updated: ' label from the leading field,
# then drops that field
isupdated = [type(fields) is list and 'Updated' in fields[0]
             for fields in df['info']]
updated_info = df.loc[isupdated, 'info']
df.loc[isupdated, 'updated'] = [fields[0].replace('Updated: ', '')
                                for fields in updated_info]
df.loc[isupdated, 'info'] = [fields[1:] for fields in updated_info]



In [19]:

    
# retrieves published: strips the 'Published: ' label from the leading field,
# then drops that field
ispublished = [type(fields) is list and 'Published' in fields[0]
               for fields in df['info']]
published_info = df.loc[ispublished, 'info']
df.loc[ispublished, 'published'] = [fields[0].replace('Published: ', '')
                                    for fields in published_info]
df.loc[ispublished, 'info'] = [fields[1:] for fields in published_info]



In [20]:

    
# retrieves status: stories with a userid default to 'Incomplete'; a leading
# 'Status' field, when present, overrides that default
isstatus = [type(fields) is list and 'Status' in fields[0]
            for fields in df['info']]
has_userid = df['userid'] != 'NA'
df.loc[has_userid, 'status'] = 'Incomplete'
status_info = df.loc[isstatus, 'info']
df.loc[isstatus, 'status'] = [fields[0].replace('Status: ', '')
                              for fields in status_info]



In [21]:

    
# removes the info column from the dataframe in place
df.drop('info', axis=1, inplace=True)



In [22]:

    
# converts the count columns to numbers; values that cannot be parsed
# (such as the 'NA' placeholder) become NaN
intcols = ['chapters', 'words', 'reviews', 'favs', 'follows']
df[intcols] = df[intcols].apply(pd.to_numeric, errors='coerce')



In [23]:

    
# formats published dates
# Each published value other than 'NA' is split on '/' into a list of parts
# and written back to the column.
pub_date = df.loc[df.published != 'NA', 'published']
pub_date = [row.split('/') for row in pub_date]
df.loc[df.published != 'NA', 'published'] = pub_date
# Lists with exactly two parts get `cyear` appended as a third part.
# NOTE(review): this relies on append() mutating the very list objects held in
# the column, and presumes a date without a year belongs to the current
# year - confirm both.
for row in df.loc[df.published != 'NA', 'published']:
    if len(row) == 2:
        row.append(cyear)



In [24]:

    
# formats updated dates
# Each updated value other than 'NA' is split on '/' into a list of parts
# and written back to the column.
upd_date = df.loc[df.updated != 'NA', 'updated']
upd_date = [row.split('/') for row in upd_date]
df.loc[df.updated != 'NA', 'updated'] = upd_date
# Lists with exactly two parts get `cyear` appended as a third part.
# NOTE(review): this relies on append() mutating the very list objects held in
# the column, and presumes a date without a year belongs to the current
# year - confirm both.
for row in df.loc[df.updated != 'NA', 'updated']:
    if len(row) == 2:
        row.append(cyear)



In [25]:

    
# labels each story's state: 'online' by default, 'deleted' when the userid is
# 'NA', and 'missing' when the error field is not 'NA' (applied last, so it wins)
is_deleted = df['userid'] == 'NA'
has_error = df['error'] != 'NA'

df['state'] = 'online'
df.loc[is_deleted, 'state'] = 'deleted'
df.loc[has_error, 'state'] = 'missing'

# the error column is not needed once the state is recorded
df.drop('error', axis=1, inplace=True)



In [26]:

    
# writes the cleaned dataframe to disk as a pickle
output_path = "../clean_data/df_story"
df.to_pickle(output_path)