notebook.community

Edit and run



In [ ]:

    
import pandas as pd
import numpy as np
from utils import remove_unicode, remove_digits, remove_punctuation, remove_stopwords



In [ ]:

    
# Load the tab-separated input into the working frame.
df = pd.read_csv("tagged.tsv", sep="\t")



In [ ]:

    
# Preview the first rows of the freshly loaded table.
df.head(n=5)



In [ ]:

    
# Free-text columns that are normalised by the cleaning pipeline below.
text_cols = "title speaker_info section target_audience type prerequisites description".split()
# Cleaners are applied in this order to every cell of every text column.
# NOTE(review): lower-casing runs last; if remove_stopwords is case-sensitive,
# capitalised stop words would survive it -- confirm against utils.
pipe = [remove_unicode, remove_digits, remove_punctuation, remove_stopwords, lambda x: x.lower()]
for col in text_cols:
    # Fill missing values first: astype(str) would otherwise turn NaN into the
    # literal text "nan", which then passes through the cleaners as a real word.
    # pop + re-assign also moves the cleaned column to the end of the frame.
    s = df.pop(col).fillna("")
    for cleaner in pipe:
        # Re-cast before each step so every cleaner receives str input.
        s = s.astype(str).apply(cleaner)
    df[col] = s



In [ ]:

    
# Preview the frame after the text columns have been cleaned.
df.head(n=5)



In [ ]:

    
# Start both URL-presence flags as False; a later cell sets them from the
# URL regex match on speaker_links / content_urls.
for flag_col in ("speaker_link_present", "content_url_present"):
    df[flag_col] = False



In [ ]:

    
# Replace missing values with "" in every object-dtype column.
for col in df:
    # Compare dtypes with ==, not identity.
    if df[col].dtype == np.dtype('O'):
        # Assign the result back: fillna(inplace=True) on the selected Series
        # acts on a temporary under pandas Copy-on-Write and leaves df untouched.
        df[col] = df[col].fillna("")



In [ ]:

    
# Regex for http/ftp/https URLs. Written as a raw string because it contains
# regex escapes (\w, \.) that are not valid string escapes in a normal literal;
# the resulting pattern text is unchanged.
URL_PATTERN = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
# Flag rows whose link columns contain at least one URL (case-insensitive).
# Missing values are treated as "no URL" so the result is a clean boolean
# column even if the link columns still hold NaN.
df["speaker_link_present"] = df["speaker_links"].fillna("").astype(str).str.contains(URL_PATTERN, case=False)
df["content_url_present"] = df["content_urls"].fillna("").astype(str).str.contains(URL_PATTERN, case=False)



In [ ]:

    
# Fraction of rows flagged as having a content URL.
df["content_url_present"].sum() / float(len(df))



In [ ]:

    
# The raw link columns are no longer needed once the presence flags exist.
for url_col in ("content_urls", "speaker_links"):
    del df[url_col]



In [ ]:

    
# Parse last_updated into datetimes; errors='raise' makes unparseable values
# fail loudly instead of being coerced.
df["last_updated"] = pd.to_datetime(df["last_updated"], errors='raise')



In [ ]:

    
# Preview the frame after last_updated has been parsed to datetimes.
df.head(n=5)



In [ ]:

    
# Per-year cut-off dates that last_updated is measured against in the
# deadlinediff cell below.
deadline_15, deadline_16 = map(pd.to_datetime, ("1 June 2015", "1 July 2016"))



In [ ]:

    
# Time between each row's last update and the cut-off date for its year.
# Built in one vectorised subtraction so the column is timedelta-typed from the
# start; seeding it with integer 0 and then writing Timedeltas into it mixes
# dtypes, which recent pandas warns about or rejects.
# Rows whose year is neither 2015 nor 2016 get NaT instead of a placeholder 0.
deadline_by_year = {2015: deadline_15, 2016: deadline_16}
# to_datetime keeps the mapped column datetime-typed even if no year matches.
df['deadlinediff'] = pd.to_datetime(df['year'].map(deadline_by_year)) - df['last_updated']



In [ ]:

    
# last_updated has been folded into deadlinediff, so remove it from the frame.
df.drop("last_updated", axis=1, inplace=True)



In [ ]:

    
# Preview the frame after last_updated has been dropped; deadlinediff has not
# yet been converted to whole days at this point.
df.head(n=5)



In [ ]:

    
# Reduce deadlinediff from timedeltas to whole days.
# Skipped when the column is already numeric, so re-running the cell does not
# fail or reinterpret day counts as nanoseconds.
if not pd.api.types.is_numeric_dtype(df['deadlinediff']):
    # to_timedelta also accepts an object column of Timedelta values (and treats
    # a leftover integer 0 as a zero-length delta), where a per-element
    # `x.days` lambda would raise AttributeError.
    df['deadlinediff'] = pd.to_timedelta(df['deadlinediff']).dt.days



In [ ]:

    
# Preview the frame after deadlinediff has been converted to days.
df.head(n=5)



In [ ]:

    
# Persist the preprocessed frame as a tab-separated file without the index.
df.to_csv("preprocessed.tsv", sep="\t", index=False)