In [1]:
# imports libraries
import pickle # import/export lists
import re # regular expression
import pandas as pd # dataframes
import datetime # dates
In [2]:
# opens raw data
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load dumps that come from a source you trust.
with open('../raw_data/data_story', 'rb') as raw_file:
    data_story = pickle.load(raw_file)
In [3]:
# sets current year, kept as a string
cyear = str(datetime.date.today().year)
In [4]:
# converts to dataframe and labels its seven columns
story_columns = ['storyid', 'userid', 'cat', 'title', 'summary', 'info', 'error']
df = pd.DataFrame(data_story)
df.columns = story_columns
In [5]:
# indexes online stories: boolean mask, True where userid is not the 'NA' placeholder
isonline = df['userid'].ne('NA')
In [6]:
# finds userid from link: keeps everything after the first '=' in the link
uid = re.compile("=(.*)$")
online_links = df.loc[isonline, 'userid']
df.loc[isonline, 'userid'] = [uid.search(link).group(1) for link in online_links]
In [7]:
# splits up category and fandom
# A two-item 'cat' list is read as [media, fandom]; a one-item list is read as
# [fandom] and labelled 'Crossover'. Every other row keeps 'NA' in both columns.
df['media'] = 'NA'
df['fandom'] = 'NA'
cat_lengths = [len(entry) if type(entry) is list else -1 for entry in df['cat']]
iscontained = [size == 2 for size in cat_lengths]
iscrossover = [size == 1 for size in cat_lengths]

contained_cats = df.loc[iscontained, 'cat']
df.loc[iscontained, 'media'] = [entry[0] for entry in contained_cats]
df.loc[iscontained, 'fandom'] = [entry[1] for entry in contained_cats]

df.loc[iscrossover, 'media'] = 'Crossover'
df.loc[iscrossover, 'fandom'] = [entry[0] for entry in df.loc[iscrossover, 'cat']]

# the raw column is no longer needed once it has been split
del df['cat']
In [8]:
# splits info
# Each online story's info string becomes the list of its ' - ' separated
# pieces. The lists are assigned as an index-aligned Series instead of a bare
# nested list, so pandas stores exactly one list per row and never tries to
# read the nested list as a 2-D block of values (which fails when every list
# happens to have the same length, e.g. with a single online story).
df.loc[isonline, 'info'] = df.loc[isonline, 'info'].map(
    lambda text: text.split(' - '))

# one placeholder column per info field, initialised to 'NA'
info_columns = ['rated', 'language', 'genre', 'characters', 'chapters', 'words',
                'reviews', 'favs', 'follows', 'updated', 'published', 'status']
for info_col in info_columns:
    df[info_col] = 'NA'
In [9]:
# retrieves rated
# The first info piece of every online story is taken as the rating; its
# 'Rated: Fiction ' prefix is stripped.
online_info = df.loc[isonline, 'info']
df.loc[isonline, 'rated'] = online_info.map(
    lambda parts: re.sub('Rated: Fiction ', '', parts[0]))
# Drops the consumed piece. The shortened lists are assigned as an
# index-aligned Series so pandas keeps one list per row instead of trying to
# interpret a nested list as 2-D data.
df.loc[isonline, 'info'] = online_info.map(lambda parts: parts[1:])
In [10]:
# retrieves language
# The next info piece of every online story is taken as the language, with
# all spaces removed.
online_info = df.loc[isonline, 'info']
df.loc[isonline, 'language'] = online_info.map(
    lambda parts: re.sub(' ', '', parts[0]))
# Drops the consumed piece. The shortened lists are assigned as an
# index-aligned Series so pandas keeps one list per row instead of trying to
# interpret a nested list as 2-D data.
df.loc[isonline, 'info'] = online_info.map(lambda parts: parts[1:])
In [11]:
# retrieves genre
# Online stories default to 'General'; the default is overwritten when the
# next info piece qualifies as a genre entry.
df.loc[isonline, 'genre'] = 'General'


def _leads_with_genre(parts):
    """Return True when the first remaining piece has no ': ' and no leading space.

    Rows whose info is not a list, whose list is empty, or whose first piece is
    an empty string are reported as False instead of raising IndexError.
    """
    if type(parts) is not list or not parts or not parts[0]:
        return False
    head = parts[0]
    return ': ' not in head and head[0] != ' '


isgenre = [_leads_with_genre(parts) for parts in df['info']]
genre_info = df.loc[isgenre, 'info']
df.loc[isgenre, 'genre'] = genre_info.map(lambda parts: parts[0])
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[isgenre, 'info'] = genre_info.map(lambda parts: parts[1:])

# splits genres
# 'Hurt/Comfort' is a single genre, so its slash is replaced before splitting
# the combined genre string on '/'. Spaces are removed from every genre name.
df['genre'] = [
    re.sub(' ', '', re.sub('Hurt/Comfort', 'Hurt-Comfort', genre)).split('/')
    for genre in df['genre']]
In [12]:
# retrieves characters
# The info string was split on ' - ', so a character entry that itself
# contains ' - ' arrives as several consecutive pieces. Every leading piece
# that is not the 'Chapters: ' / 'Words: ' entry is treated as part of the
# character entry.
def _leads_with_character_piece(parts):
    """Return True when the first remaining piece is neither Chapters nor Words.

    Non-list rows and exhausted (empty) lists are reported as False, which
    also guarantees that the collection loop below terminates.
    """
    return (type(parts) is list and len(parts) > 0
            and 'Chapters: ' not in parts[0] and 'Words: ' not in parts[0])


ischaracter = [_leads_with_character_piece(parts) for parts in df['info']]
character_info = df.loc[ischaracter, 'info']
df.loc[ischaracter, 'characters'] = character_info.map(lambda parts: parts[0])
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[ischaracter, 'info'] = character_info.map(lambda parts: parts[1:])

# appends characters in case the entry was split
while True:
    ischaracter = [_leads_with_character_piece(parts) for parts in df['info']]
    if not any(ischaracter):
        break
    character_info = df.loc[ischaracter, 'info']
    # Re-insert the ' - ' that the split consumed, so the original entry is
    # restored instead of fusing the two pieces together.
    df.loc[ischaracter, 'characters'] = (
        df.loc[ischaracter, 'characters'] + ' - '
        + character_info.map(lambda parts: parts[0]))
    df.loc[ischaracter, 'info'] = character_info.map(lambda parts: parts[1:])
In [13]:
# retrieves chapters
# Rows whose next info piece mentions 'Chapters'; the length check keeps an
# exhausted list from raising IndexError.
ischapter = [type(row) is list and len(row) > 0 and 'Chapters' in row[0]
             for row in df['info']]
chapter_info = df.loc[ischapter, 'info']
# Keeps only the digits. The pattern is a raw string so that \D is handed to
# the regex engine as written instead of relying on Python passing through an
# unrecognised string escape.
df.loc[ischapter, 'chapters'] = chapter_info.map(
    lambda parts: re.sub(r"\D", "", parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[ischapter, 'info'] = chapter_info.map(lambda parts: parts[1:])
In [14]:
# retrieves words
# Rows whose next info piece mentions 'Words'. Checking the label (as the
# other count cells do) keeps an unrelated piece from being consumed and
# mis-read as a word count; the length check avoids IndexError on an
# exhausted list.
iswords = [type(row) is list and len(row) > 0 and 'Words' in row[0]
           for row in df['info']]
words_info = df.loc[iswords, 'info']
# Keeps only the digits. The pattern is a raw string so that \D is handed to
# the regex engine as written instead of relying on Python passing through an
# unrecognised string escape.
df.loc[iswords, 'words'] = words_info.map(
    lambda parts: re.sub(r"\D", "", parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[iswords, 'info'] = words_info.map(lambda parts: parts[1:])
In [15]:
# retrieves reviews
# Rows whose next info piece mentions 'Reviews'; the length check keeps an
# exhausted list from raising IndexError.
isreviews = [type(row) is list and len(row) > 0 and 'Reviews' in row[0]
             for row in df['info']]
reviews_info = df.loc[isreviews, 'info']
# Keeps only the digits. The pattern is a raw string so that \D is handed to
# the regex engine as written instead of relying on Python passing through an
# unrecognised string escape.
df.loc[isreviews, 'reviews'] = reviews_info.map(
    lambda parts: re.sub(r"\D", "", parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[isreviews, 'info'] = reviews_info.map(lambda parts: parts[1:])
In [16]:
# retrieves favs
# Rows whose next info piece mentions 'Favs'; the length check keeps an
# exhausted list from raising IndexError.
isfavs = [type(row) is list and len(row) > 0 and 'Favs' in row[0]
          for row in df['info']]
favs_info = df.loc[isfavs, 'info']
# Keeps only the digits. The pattern is a raw string so that \D is handed to
# the regex engine as written instead of relying on Python passing through an
# unrecognised string escape.
df.loc[isfavs, 'favs'] = favs_info.map(
    lambda parts: re.sub(r"\D", "", parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[isfavs, 'info'] = favs_info.map(lambda parts: parts[1:])
In [17]:
# retrieves follows
# Rows whose next info piece mentions 'Follows'; the length check keeps an
# exhausted list from raising IndexError.
isfollows = [type(row) is list and len(row) > 0 and 'Follows' in row[0]
             for row in df['info']]
follows_info = df.loc[isfollows, 'info']
# Keeps only the digits. The pattern is a raw string so that \D is handed to
# the regex engine as written instead of relying on Python passing through an
# unrecognised string escape.
df.loc[isfollows, 'follows'] = follows_info.map(
    lambda parts: re.sub(r"\D", "", parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[isfollows, 'info'] = follows_info.map(lambda parts: parts[1:])
In [18]:
# retrieves updated
# Rows whose next info piece mentions 'Updated'; the length check keeps an
# exhausted list from raising IndexError.
isupdated = [type(row) is list and len(row) > 0 and 'Updated' in row[0]
             for row in df['info']]
updated_info = df.loc[isupdated, 'info']
# Strips the 'Updated: ' label and keeps the rest of the piece as text.
df.loc[isupdated, 'updated'] = updated_info.map(
    lambda parts: re.sub('Updated: ', '', parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[isupdated, 'info'] = updated_info.map(lambda parts: parts[1:])
In [19]:
# retrieves published
# Rows whose next info piece mentions 'Published'; the length check keeps an
# exhausted list from raising IndexError.
ispublished = [type(row) is list and len(row) > 0 and 'Published' in row[0]
               for row in df['info']]
published_info = df.loc[ispublished, 'info']
# Strips the 'Published: ' label and keeps the rest of the piece as text.
df.loc[ispublished, 'published'] = published_info.map(
    lambda parts: re.sub('Published: ', '', parts[0]))
# Index-aligned Series of lists: pandas stores one list per row rather than
# trying to interpret a nested list as 2-D data.
df.loc[ispublished, 'info'] = published_info.map(lambda parts: parts[1:])
In [20]:
# retrieves status
# Rows whose next info piece mentions 'Status'.
isstatus = [type(parts) is list and 'Status' in parts[0] for parts in df['info']]
# Every row with a userid starts as 'Incomplete'; rows carrying an explicit
# status piece are then overwritten with its value.
df.loc[df['userid'] != 'NA', 'status'] = 'Incomplete'
status_pieces = [parts[0] for parts in df.loc[isstatus, 'info']]
df.loc[isstatus, 'status'] = [piece.replace('Status: ', '') for piece in status_pieces]
In [21]:
# drops the info column
df = df.drop('info', axis=1)
In [22]:
# format numeric types
# Values that cannot be parsed as numbers (such as the 'NA' placeholder)
# become NaN because of errors='coerce'.
intcols = ['chapters', 'words', 'reviews', 'favs', 'follows']
df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in intcols})
In [23]:
# formats published dates
# Each date string is split on '/' into its parts; a date with only two parts
# gets cyear appended as the third.
# NOTE(review): cyear is the year in which this notebook runs -- confirm that
# it matches the year the data was collected.
def _split_published(date_text):
    """Split a '/'-separated date and append cyear when only two parts exist."""
    parts = date_text.split('/')
    if len(parts) == 2:
        parts.append(cyear)
    return parts


haspublished = df['published'] != 'NA'
# The completed lists are built first and assigned as an index-aligned Series,
# so the result does not depend on mutating list objects reached through a
# .loc copy, and pandas never tries to read a nested list as 2-D data.
pub_date = df.loc[haspublished, 'published'].map(_split_published)
df.loc[haspublished, 'published'] = pub_date
In [24]:
# formats updated dates
# Each date string is split on '/' into its parts; a date with only two parts
# gets cyear appended as the third.
# NOTE(review): cyear is the year in which this notebook runs -- confirm that
# it matches the year the data was collected.
def _split_updated(date_text):
    """Split a '/'-separated date and append cyear when only two parts exist."""
    parts = date_text.split('/')
    if len(parts) == 2:
        parts.append(cyear)
    return parts


hasupdated = df['updated'] != 'NA'
# The completed lists are built first and assigned as an index-aligned Series,
# so the result does not depend on mutating list objects reached through a
# .loc copy, and pandas never tries to read a nested list as 2-D data.
upd_date = df.loc[hasupdated, 'updated'].map(_split_updated)
df.loc[hasupdated, 'updated'] = upd_date
In [25]:
# finds current state of story
# Precedence: a recorded error marks the story 'missing'; otherwise an 'NA'
# userid marks it 'deleted'; everything else is 'online'.
df['state'] = [
    'missing' if error != 'NA' else ('deleted' if userid == 'NA' else 'online')
    for userid, error in zip(df['userid'], df['error'])]
del df['error']
In [26]:
# saves dataframe as a pickle file
clean_path = "../clean_data/df_story"
df.to_pickle(clean_path)