In [ ]:
import pandas as pd
import numpy as np
from utils import remove_unicode, remove_digits, remove_punctuation, remove_stopwords
In [ ]:
# Load the tab-separated input file into the working DataFrame.
df = pd.read_table(filepath_or_buffer="tagged.tsv")
In [ ]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)
In [ ]:
# Free-text columns that are run through the cleaning pipeline below.
text_cols = "title speaker_info section target_audience type prerequisites description".split()

# Cleaning steps, applied to every cell in this order. The remove_* helpers
# come from utils; their exact behaviour is not visible from this file.
# NOTE(review): lower-casing runs after remove_stopwords -- if that helper
# matches case-sensitively, capitalised stopwords survive; confirm the order.
pipe = [remove_unicode, remove_digits, remove_punctuation, remove_stopwords, str.lower]

for col in text_cols:
    # pop + re-assign (rather than assigning in place) moves the cleaned
    # column to the end of the frame, as the original code did.
    # Missing values are replaced by "" first: astype(str) would otherwise
    # turn NaN into the literal text "nan", which the cleaners would then
    # treat as real content.
    s = df.pop(col).astype(object).fillna("")
    for cleaner in pipe:
        # astype(str) is kept per step so every cleaner receives str input
        # even if a previous step returned something else.
        s = s.astype(str).apply(cleaner)
    df[col] = s
In [ ]:
# Preview the first five rows after the text columns have been cleaned.
df.head(n=5)
In [ ]:
# Start both URL-presence indicator columns at False for every row.
for flag_column in ("speaker_link_present", "content_url_present"):
    df[flag_column] = False
In [ ]:
# Replace missing values with "" in every object-dtype column.
for col in df.columns:
    # Compare dtypes by equality; identity (`is`) only works by accident of
    # NumPy caching its built-in dtype instances.
    if df[col].dtype == np.dtype("O"):
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the in-place call mutates a temporary Series and does not
        # update df under pandas Copy-on-Write.
        df[col] = df[col].fillna("")
In [ ]:
# Regex for an http/https/ftp URL with a dotted host and an optional path.
# Raw string: \w and \. must reach the regex engine untouched, and are invalid
# escape sequences in a normal string literal. Groups are non-capturing because
# only a yes/no match is needed (str.contains warns about capture groups).
URL_PATTERN = r"(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

# Flag rows whose link fields contain at least one URL (case-insensitive).
# na=False makes missing / non-string cells count as "no URL", so the result
# is a clean boolean column rather than one containing NaN.
df["speaker_link_present"] = df["speaker_links"].str.contains(URL_PATTERN, case=False, na=False)
df["content_url_present"] = df["content_urls"].str.contains(URL_PATTERN, case=False, na=False)
In [ ]:
# Sum of the content_url_present column divided by the number of rows.
df["content_url_present"].sum() / float(len(df))
In [ ]:
# Remove the raw link columns from the frame.
for raw_column in ("content_urls", "speaker_links"):
    del df[raw_column]
In [ ]:
# Parse last_updated into datetimes; errors="raise" aborts on any value
# that cannot be parsed instead of silently producing NaT.
parsed_last_updated = pd.to_datetime(df["last_updated"], errors="raise")
df["last_updated"] = parsed_last_updated
In [ ]:
# Preview the first five rows after last_updated has been parsed.
df.head(n=5)
In [ ]:
# Reference dates for the 2016 and 2015 rows, parsed from their string form.
deadline_16, deadline_15 = map(pd.to_datetime, ("1 July 2016", "1 June 2015"))
In [ ]:
# Time between each row's last update and the deadline for its year.
# Each row's year is mapped to its deadline and subtracted in one step, so the
# column is timedelta-typed from the start. Seeding it with int 0 and writing
# timedeltas into it mixes dtypes, and left rows from any other year holding
# an int that has no `.days`.
# Rows whose year is neither 2015 nor 2016 get NaT.
deadline_by_year = pd.to_datetime(df["year"].map({2015: deadline_15, 2016: deadline_16}))
df["deadlinediff"] = deadline_by_year - df["last_updated"]
In [ ]:
# Remove the last_updated column from the frame.
del df["last_updated"]
In [ ]:
# Preview the first five rows after last_updated has been removed.
df.head(n=5)
In [ ]:
# Replace each deadlinediff value with its `.days` attribute.
df["deadlinediff"] = df["deadlinediff"].apply(lambda delta: delta.days)
In [ ]:
# Preview the first five rows with deadlinediff expressed in days.
df.head(n=5)
In [ ]:
# Write the processed frame as tab-separated text, without the index column.
df.to_csv(
    "preprocessed.tsv",
    sep="\t",
    index=False,
)