What we did last time

Specified the data type of a column
Parsed timestamp strings into proper datetime values
Dropped outliers from a numerical array
Checked text for valid email addresses



In [1]:

    
from pysemantic import Project
import numpy as np



In [2]:

    
# Create the pysemantic Project object for the project named "demo".
# NOTE(review): presumably "demo" must already be registered in the pysemantic
# configuration for this lookup to succeed — confirm.
demo = Project("demo")



In [3]:

    
# Load the "dummy_data" dataset through the project object.
# NOTE(review): presumably the rules recapped at the top of this notebook
# (dtypes, timestamp parsing, outlier and email checks) are applied here — confirm.
data = demo.load_dataset("dummy_data")



In [4]:

    
# Preview the first rows of the loaded data.
# The displayed index skips 1; presumably that row was removed while loading — confirm.
data.head()









    Out[4]:






  
    
      
      date
      zip
      X
      email
    
  
  
    
      0
      2015-02-21 01:05:03
      13611
      5.014501
      jeff.dasovich@enron.com
    
    
      2
      2015-02-27 10:16:34
      02888
      8.918459
      karen.denne@enron.com
    
    
      3
      2015-02-20 19:11:00
      07827
      5.664665
      enron-owner@lists.qgadc.com
    
    
      4
      2015-02-21 13:20:11
      23887
      6.159554
      jeff.dasovich@enron.com
    
    
      5
      2015-02-22 04:17:01
      35461
      5.618556
      jeff.dasovich@enron.com



In [5]:

    
# List every column next to its dtype to confirm the types the loader produced.
# A single pre-formatted string is passed to print() so the cell emits the same
# "name dtype" line whether print is a statement (Python 2) or a function (Python 3).
for column in data:
    print("%s %s" % (column, data[column].dtype))









    



date datetime64[ns]
zip object
X float64
email object



In [6]:

    
# Zip codes are kept as strings (dtype object) so leading zeros survive;
# each one is expected to be exactly 5 characters long.
zip_lengths = data['zip'].apply(len)
# Prints True if any zip code has a length other than 5, False otherwise.
# Parenthesised single-argument form runs on both Python 2 and Python 3.
print(np.any(zip_lengths != 5))









    



False



In [7]:

    
# Show the largest remaining value of X.
# NOTE(review): presumably a check that outliers were dropped on load — confirm the expected bound.
# Parenthesised single-argument form runs on both Python 2 and Python 3.
print(data['X'].max())









    



8.99728774235

	date	zip	X	email
0	2015-02-21 01:05:03	13611	5.014501	jeff.dasovich@enron.com
2	2015-02-27 10:16:34	02888	8.918459	karen.denne@enron.com
3	2015-02-20 19:11:00	07827	5.664665	enron-owner@lists.qgadc.com
4	2015-02-21 13:20:11	23887	6.159554	jeff.dasovich@enron.com
5	2015-02-22 04:17:01	35461	5.618556	jeff.dasovich@enron.com