notebook.community

Edit and run



In [1]:

    
#!/usr/bin/env python
# -*- coding: latin-1 -*-



In [2]:

    
import pandas as pd
import pickle
import re

from library.clean import replace_string, lower_case_columns, rid_punctuation, slice_and_dice_col, str_to_date, stringify_data



In [4]:

    
# Load the consolidated article records, clean the text columns, keep only
# rows whose lead mentions "marijuana", cache the result, and preview it.
#
# The cleaning helpers come from library.clean; their exact semantics are not
# visible in this notebook, so the notes below describe only how they are
# called here.

# --- load -------------------------------------------------------------------
# NOTE(review): unpickling runs arbitrary code for a crafted file -- only load
# pickles that were produced by this project.
l = pd.read_pickle('data/re_merge/consolidated_flat.pickle')

# Build the frame, then discard exact duplicate rows and rows with any missing
# value before the columns are stringified.
df = (
    pd.DataFrame(l, columns=['nyt_id', 'date_pub', 'org', 'hl', 'lead'])
    .drop_duplicates()
    .dropna()
)
df = stringify_data(df)

# --- clean ------------------------------------------------------------------
# The pattern matches, from the start of the string, a run of non-hyphen
# characters followed by "--" and any trailing spaces.
# NOTE(review): presumably replace_string removes that prefix -- confirm in
# library.clean.
pattern = re.compile("^([^-]*)-- *")
df['lead'] = list(map(lambda text: replace_string(text, pattern), df['lead']))

# Each helper takes the frame as its first argument and returns the next
# frame, so the stages are applied in the same order via pipe.
# NOTE(review): slice_and_dice_col(…, 0, 8) presumably keeps the first eight
# characters of date_pub ahead of date parsing -- confirm.
df = (
    df
    .pipe(rid_punctuation, ['date_pub', 'hl', 'lead'])
    .pipe(slice_and_dice_col, 'date_pub', 0, 8)
    .pipe(lower_case_columns)
    .pipe(str_to_date, 'date_pub')
)

# --- filter, persist, preview -----------------------------------------------
mentions_marijuana = df['lead'].str.contains('marijuana')
df = df[mentions_marijuana]

df.to_pickle('data/re_merge/clean.pickle')
df.head(3)









    Out[4]:






  
    
      
|    | nyt_id                   | org                  | hl                                                 | lead                                               | date_pub   |
|----|--------------------------|----------------------|----------------------------------------------------|----------------------------------------------------|------------|
| 3  | 4fc0a09745c1498b0d3ba216 | none                 | marijuana smoking is reported safe hemp leaves...  | a panaman judge recently sentenced an american...  | 1926-11-21 |
| 9  | 4fc1d8e345c1498b0d4ccb9f | none                 | use of marijuana spreading in west poisonous w...  | although as appalling in its effects on the hu...  | 1934-09-16 |
| 12 | 4fc1ebab45c1498b0d528e5b | the associated press | rhode island to end weed as drug source state ...  | providence ri jan 19 rhode island authorities ...  | 1935-01-20 |



In [ ]:

	nyt_id	org	hl	lead	date_pub
3	4fc0a09745c1498b0d3ba216	none	marijuana smoking is reported safe hemp leaves...	a panaman judge recently sentenced an american...	1926-11-21
9	4fc1d8e345c1498b0d4ccb9f	none	use of marijuana spreading in west poisonous w...	although as appalling in its effects on the hu...	1934-09-16
12	4fc1ebab45c1498b0d528e5b	the associated press	rhode island to end weed as drug source state ...	providence ri jan 19 rhode island authorities ...	1935-01-20