In [1]:
#!/usr/bin/env python
# -*- coding: latin-1 -*-
In [2]:
import pandas as pd
import pickle
import re
from library.clean import replace_string, lower_case_columns, rid_punctuation, slice_and_dice_col, str_to_date, stringify_data
In [4]:
# Load the consolidated records, clean the text/date columns, keep only the
# rows whose lead mentions the keyword, persist the result, and preview it.
#
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the
# file -- only load pickles that were produced by this project.
RAW_PICKLE_PATH = 'data/re_merge/consolidated_flat.pickle'
CLEAN_PICKLE_PATH = 'data/re_merge/clean.pickle'
KEYWORD = 'marijuana'

raw_records = pd.read_pickle(RAW_PICKLE_PATH)

# Build the working frame without in-place mutation: restrict to the five
# columns of interest, then drop exact duplicate rows and rows with any NaN.
df = (
    pd.DataFrame(raw_records, columns=['nyt_id', 'date_pub', 'org', 'hl', 'lead'])
    .drop_duplicates()
    .dropna()
)
df = stringify_data(df)

# Regex: from the start of the string, any run of non-hyphen characters
# followed by "--" and optional spaces.
# NOTE(review): presumably replace_string strips this prefix (looks like a
# dateline such as "CITY, Jan. 1 -- ") from each lead -- confirm in library.clean.
pattern = re.compile("^([^-]*)-- *")
df = df.assign(lead=[replace_string(text, pattern) for text in df['lead']])

df = rid_punctuation(df, ['date_pub', 'hl', 'lead'])
# NOTE(review): presumably keeps characters 0..8 of date_pub (e.g. YYYYMMDD)
# so that str_to_date can parse it -- confirm against library.clean.
df = slice_and_dice_col(df, 'date_pub', 0, 8)
df = lower_case_columns(df)
df = str_to_date(df, 'date_pub')

# Literal (non-regex) substring match; na=False treats any missing lead as
# "no match" instead of producing a NaN in the boolean mask.
# NOTE(review): the keyword is lowercase, so this relies on the leads having
# been lowercased above -- verify lower_case_columns affects cell values.
df = df[df['lead'].str.contains(KEYWORD, regex=False, na=False)]

df.to_pickle(CLEAN_PICKLE_PATH)
df.head(3)
Out[4]:
In [ ]: