In [1]:
# weather.csv file downloaded from https://www.kaggle.com/c/predict-west-nile-virus/data

%matplotlib inline
import sys
sys.path.append("../bin/")
from data import DataIn
import pandas as pd
from preprocess import summary

# Load the raw weather table and print its per-column summary.
# No `if __name__ == "__main__":` guard here: a notebook cell always runs as
# top-level code, and the next cell uses `weather` unconditionally, so the
# guard only added indentation.
# NOTE(review): the summary printed below reports "No missing values", yet
# several columns have "M" or "-" as their most frequent value (e.g. Depart,
# Water1, Sunrise) -- presumably placeholder codes for missing data; confirm
# before relying on that message.
weather = DataIn("weather.csv")
weather.summarize()


No missing values in the columns of weather.csv!

--------------------------------------------------------------------------------
********************    Begin of the summary of text data   ********************
--------------------------------------------------------------------------------
count           2944
unique          1472
top       2011-08-18
freq               2
Name: Date, dtype: object


count     2944
unique      60
top         73
freq       138
Name: Tavg, dtype: object


count     2944
unique      42
top          M
freq      1472
Name: Depart, dtype: object


count     2944
unique      48
top         63
freq       135
Name: WetBulb, dtype: object


count     2944
unique      31
top          0
freq      1870
Name: Heat, dtype: object


count     2944
unique      31
top          0
freq      1147
Name: Cool, dtype: object


count     2944
unique     122
top          -
freq      1472
Name: Sunrise, dtype: object


count     2944
unique     119
top          -
freq      1472
Name: Sunset, dtype: object


count     2944
unique      98
top           
freq      1609
Name: CodeSum, dtype: object


count     2944
unique       2
top          0
freq      1472
Name: Depth, dtype: object


count     2944
unique       1
top          M
freq      2944
Name: Water1, dtype: object


count     2944
unique       4
top          M
freq      1472
Name: SnowFall, dtype: object


count     2944
unique     168
top       0.00
freq      1577
Name: PrecipTotal, dtype: object


count      2944
unique      104
top       29.34
freq        128
Name: StnPressure, dtype: object


count      2944
unique      102
top       30.00
freq         96
Name: SeaLevel, dtype: object


count     2944
unique     178
top        6.9
freq        63
Name: AvgSpeed, dtype: object


--------------------------------------------------------------------------------
********************    End of the summary of text data     ********************
--------------------------------------------------------------------------------

In [2]:
# Convert the data to numeric type and remove NaN entries, then print the
# summary again to see which columns are still reported as text.
# NOTE(review): numeric() appears to modify `weather` in place -- its return
# value is not used, and the summary that follows differs from the first one
# (count 1294 instead of 2944; only Date, CodeSum and Water1 remain listed as
# text, with Water1 still entirely "M"). Re-running this cell on its own may
# therefore not be idempotent; confirm against DataIn.numeric.
weather.numeric()
weather.summarize()


--------------------------------------------------------------------------------
********************    Begin of the summary of text data   ********************
--------------------------------------------------------------------------------
count           1294
unique          1294
top       2008-10-30
freq               1
Name: Date, dtype: object


count     1294
unique      62
top           
freq       730
Name: CodeSum, dtype: object


count     1294
unique       1
top          M
freq      1294
Name: Water1, dtype: object


--------------------------------------------------------------------------------
********************    End of the summary of text data     ********************
--------------------------------------------------------------------------------