In [1]:
import pandas as pd
import numpy as np
import os
import os.path as op
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# First pass: load the CSV and let pandas infer a type for every column.
data = pd.read_csv(filepath_or_buffer="dummy_data.csv")
In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)
Out[3]:
In [4]:
# Reload, this time forcing the 'zip' column to be read as str instead of
# letting pandas infer a type for it.
data = pd.read_csv(
    "dummy_data.csv",
    dtype={'zip': str},
)
In [5]:
# Show the first five rows again to inspect the result of the reload.
data.head(5)
Out[5]:
In [6]:
# Reload once more: keep 'zip' as str and additionally ask read_csv to parse
# the 'date' column as datetimes.
data = pd.read_csv(
    "dummy_data.csv",
    dtype={'zip': str},
    parse_dates=['date'],
)
In [7]:
# Report the dtype the 'date' column ended up with after the load.
# print() in call form runs on both Python 2 and Python 3; the bare
# `print x` statement is a SyntaxError on Python 3.
print(data['date'].dtype)
In [8]:
# Strict conversion attempt: errors='raise' makes pandas fail on the first
# value it cannot convert. The failure is caught and printed so the problem
# is visible without aborting a top-to-bottom run of the notebook; when the
# conversion succeeds, `dates` is bound exactly as before.
try:
    dates = pd.to_datetime(data['date'], errors='raise')
except ValueError as err:
    print("Could not parse 'date' column: {}".format(err))
In [9]:
# Flag rows whose second '-'-separated field of the date string is zero
# (presumably the month in a YYYY-MM-DD layout -- confirm against the data).
bad_dates = data['date'].apply(lambda x: int(x.split('-')[1]) == 0)

# Keep only the unflagged rows. A boolean-mask reassignment replaces
# drop(..., inplace=True) so no frame is mutated in place; .copy() gives an
# independent frame that is safe to assign columns into.
data = data[~bad_dates].copy()

# pop() removes the raw string column; the parsed datetimes are then stored
# back under the same name.
data['date'] = pd.to_datetime(data.pop('date'))

# print() in call form works on both Python 2 and Python 3.
print(data['date'].dtype)
In [10]:
# Draw a 100-bin histogram of column 'X' on an 8x8-inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
data['X'].hist(bins=100, ax=ax)
Out[10]:
In [11]:
# Keep only the rows whose 'X' value is at most 9.0.
data = data.loc[data['X'].le(9.0)]
In [12]:
# Plot the distribution of 'X' again: 100 bins on an 8x8-inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
data['X'].hist(bins=100, ax=ax)
Out[12]:
In [13]:
# Peek at the first five values of the 'email' column.
data.loc[:, 'email'].head(5)
Out[13]:
In [14]:
# Anchored pattern for a simple "local@domain.tld" shape check:
#   local part : letters, digits, and _ . + -
#   domain     : letters, digits, and -
#   suffix     : letters, digits, - and . (so multi-part suffixes match)
# Written without a capturing group: the pattern is only used for a yes/no
# match, and pandas' Series.str.contains warns when a pattern has match groups.
email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
In [15]:
# Keep only rows whose 'email' value matches email_pattern.
# na=False makes missing emails count as non-matching; without it the mask
# would contain NaN for those rows and could not be used for boolean indexing.
data = data[data['email'].str.contains(email_pattern, na=False)]
In [16]:
# Display the first five rows of the frame after the email filter.
data.head(n=5)
Out[16]: