In [1]:
#!/usr/bin/env python
# -*- coding: latin-1 -*-

In [2]:
import pandas as pd
import pickle
import re

from library.clean import replace_string, lower_case_columns, rid_punctuation, slice_and_dice_col, str_to_date, stringify_data

In [4]:
# Build the cleaned table of marijuana-related articles and cache it on disk.

# Load the consolidated records and keep the five fields used below; exact
# duplicate rows and rows with any missing field are discarded before the
# project helper stringify_data is applied.
records = pd.read_pickle('data/re_merge/consolidated_flat.pickle')
df = (
    pd.DataFrame(records, columns=['nyt_id', 'date_pub', 'org', 'hl', 'lead'])
    .drop_duplicates()
    .dropna()
    .pipe(stringify_data)
)

# Anchored at the start of the text: matches everything before the first
# dash when that dash begins a "--", plus the "--" and any spaces after it.
# NOTE(review): presumably replace_string strips this match (a leading
# dateline) from each lead -- confirm in library.clean.
dateline_re = re.compile("^([^-]*)-- *")
df['lead'] = [replace_string(text, dateline_re) for text in df['lead']]

# Normalisation steps from library.clean, applied in this exact order.
# NOTE(review): slice_and_dice_col(..., 0, 8) presumably trims date_pub to
# its first 8 characters (YYYYMMDD once punctuation is gone) -- confirm.
df = (
    df
    .pipe(rid_punctuation, ['date_pub', 'hl', 'lead'])
    .pipe(slice_and_dice_col, 'date_pub', 0, 8)
    .pipe(lower_case_columns)
    .pipe(str_to_date, 'date_pub')
)

# Keep only the articles whose lead paragraph contains the keyword, then
# persist the result for later use.
has_keyword = df['lead'].str.contains('marijuana')
df = df[has_keyword]
df.to_pickle('data/re_merge/clean.pickle')

# Preview the first rows as the cell output.
df.head(3)


Out[4]:
nyt_id org hl lead date_pub
3 4fc0a09745c1498b0d3ba216 none marijuana smoking is reported safe hemp leaves... a panaman judge recently sentenced an american... 1926-11-21
9 4fc1d8e345c1498b0d4ccb9f none use of marijuana spreading in west poisonous w... although as appalling in its effects on the hu... 1934-09-16
12 4fc1ebab45c1498b0d528e5b the associated press rhode island to end weed as drug source state ... providence ri jan 19 rhode island authorities ... 1935-01-20

In [ ]: