notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
from sklearn.utils import shuffle



In [4]:

    
# Feature: Month, Week, Latitude, Longitude, NumMosq in Nearest Area, Near Dis, TMax, Tmin, 
# date = get("Date") -> weather
# month = float(date.split('-')[1]) -> weather
# week = int(date.split('-')[1]) * 4 + int(date.split('-')[2]) / 7 -> weather
# latitude = float(get("Latitude"))
# longitude = float(get("Longitude"))
# tmax = float(weather_dic[date][weather_indexes["Tmax"]]) -> weather
# tmin = float(weather_dic[date][weather_indexes["Tmin"]]) -> weather
# tavg = float(weather_dic[date][weather_indexes["Tavg"]]) -> weather
# dewpoint = float(weather_dic[date][weather_indexes["DewPoint"]]) -> weather
# wetbulb = float(weather_dic[date][weather_indexes["WetBulb"]]) -> weather
# pressure = float(weather_dic[date][weather_indexes["StnPressure"]])-> weather



In [64]:

    
df_weather = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv")
df_weather.columns









    Out[64]:





Index([u'Station', u'Date', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint',
       u'WetBulb', u'Heat', u'Cool', u'Sunrise', u'Sunset', u'CodeSum',
       u'Depth', u'Water1', u'SnowFall', u'PrecipTotal', u'StnPressure',
       u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed'],
      dtype='object')



In [66]:

    
df_weather = df_weather[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
df_weather.head(2)









    Out[66]:






  
    
      
      Date
      Station
      Tmax
      Tmin
      Tavg
      DewPoint
      WetBulb
      StnPressure
    
  
  
    
      0
      2007-05-01
      1
      83
      50
      67
      51
      56
      29.10
    
    
      1
      2007-05-01
      2
      84
      52
      68
      51
      57
      29.18



In [56]:

    
# Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level



In [80]:

    
def closest_station(x):
    lat, longi = x[0], x[1]
    # Chicago is small enough that we can treat coordinates as rectangular.
    stations = np.array([[41.995, -87.933],
                         [41.786, -87.752]])
    loc = np.array([lat, longi])
    deltas = stations - loc[None, :]
    dist2 = (deltas**2).sum(1)
    return np.argmin(dist2) + 1



In [61]:

    
df_train = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/train.csv")
df_train.columns









    Out[61]:





Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')



In [62]:

    
df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent']]
df_train['month'] = pd.DatetimeIndex(df_train['Date']).month
df_train['week'] = pd.DatetimeIndex(df_train['Date']).week



In [67]:

    
df_train.head(2)









    Out[67]:






  
    
      
      Date
      Species
      Latitude
      Longitude
      NumMosquitos
      WnvPresent
      month
      week
    
  
  
    
      0
      2007-05-29
      CULEX PIPIENS/RESTUANS
      41.95469
      -87.800991
      1
      0
      5
      22
    
    
      1
      2007-05-29
      CULEX RESTUANS
      41.95469
      -87.800991
      1
      0
      5
      22



In [123]:

    
df_train.describe()









    Out[123]:






  
    
      
      Latitude
      Longitude
      NumMosquitos
      WnvPresent
      month
      week
      Station
    
  
  
    
      count
      10506.000000
      10506.000000
      10506.000000
      10506.000000
      10506.000000
      10506.000000
      10506.000000
    
    
      mean
      41.841139
      -87.699908
      12.853512
      0.052446
      7.692557
      31.687131
      1.784790
    
    
      std
      0.112742
      0.096514
      16.133816
      0.222936
      1.067675
      4.526874
      0.410988
    
    
      min
      41.644612
      -87.930995
      1.000000
      0.000000
      5.000000
      22.000000
      1.000000
    
    
      25%
      41.732984
      -87.760070
      2.000000
      0.000000
      7.000000
      28.000000
      2.000000
    
    
      50%
      41.846283
      -87.694991
      5.000000
      0.000000
      8.000000
      31.000000
      2.000000
    
    
      75%
      41.954690
      -87.627796
      17.000000
      0.000000
      8.000000
      35.000000
      2.000000
    
    
      max
      42.017430
      -87.531635
      50.000000
      1.000000
      10.000000
      41.000000
      2.000000



In [69]:

    
species_map = {'CULEX RESTUANS' : "100000",
              'CULEX TERRITANS' : "010000", 
              'CULEX PIPIENS'   : "001000", 
              'CULEX PIPIENS/RESTUANS' : "101000", 
              'CULEX ERRATICUS' : "000100", 
              'CULEX SALINARIUS': "000010", 
              'CULEX TARSALIS' :  "000001",
              'UNSPECIFIED CULEX': "001000"}



In [71]:

    
df_train['Species'] = df_train['Species'].map(lambda x : species_map[x])
df_train.head(2)









    Out[71]:






  
    
      
      Date
      Species
      Latitude
      Longitude
      NumMosquitos
      WnvPresent
      month
      week
    
  
  
    
      0
      2007-05-29
      101000
      41.95469
      -87.800991
      1
      0
      5
      22
    
    
      1
      2007-05-29
      100000
      41.95469
      -87.800991
      1
      0
      5
      22



In [82]:

    
df_train['Station'] = df_train[['Latitude', 'Longitude']].apply(func=closest_station, axis=1)
# 0 : apply function to each column
# 1 : apply function to each row
df_train.head(2)









    Out[82]:






  
    
      
      Date
      Species
      Latitude
      Longitude
      NumMosquitos
      WnvPresent
      month
      week
      Station
    
  
  
    
      0
      2007-05-29
      101000
      41.95469
      -87.800991
      1
      0
      5
      22
      1
    
    
      1
      2007-05-29
      100000
      41.95469
      -87.800991
      1
      0
      5
      22
      1



In [74]:

    
df_weather[df_weather['Date'] == '2007-05-31']









    Out[74]:






  
    
      
      Date
      Station
      Tmax
      Tmin
      Tavg
      DewPoint
      WetBulb
      StnPressure
    
  
  
    
      60
      2007-05-31
      1
      78
      65
      72
      61
      65
      29.28
    
    
      61
      2007-05-31
      2
      80
      68
      74
      61
      66
      29.34



In [72]:

    
df_train[df_train['Date'] == '2007-05-29'].head(2)









    Out[72]:






  
    
      
      Date
      Species
      Latitude
      Longitude
      NumMosquitos
      WnvPresent
      month
      week
    
  
  
    
      0
      2007-05-29
      101000
      41.95469
      -87.800991
      1
      0
      5
      22
    
    
      1
      2007-05-29
      100000
      41.95469
      -87.800991
      1
      0
      5
      22



In [105]:

    
# df_train[:50]
# df_weather[55:90]
# merge전에 위에 데이터로 테스트해봄



In [106]:

    
df = df_train.merge(df_weather, how='inner', on=['Date', 'Station'])
df.head(3)









    Out[106]:






  
    
      
      Date
      Species
      Latitude
      Longitude
      NumMosquitos
      WnvPresent
      month
      week
      Station
      Tmax
      Tmin
      Tavg
      DewPoint
      WetBulb
      StnPressure
    
  
  
    
      0
      2007-05-29
      101000
      41.954690
      -87.800991
      1
      0
      5
      22
      1
      88
      60
      74
      58
      65
      29.39
    
    
      1
      2007-05-29
      100000
      41.954690
      -87.800991
      1
      0
      5
      22
      1
      88
      60
      74
      58
      65
      29.39
    
    
      2
      2007-05-29
      100000
      41.994991
      -87.769279
      1
      0
      5
      22
      1
      88
      60
      74
      58
      65
      29.39



In [110]:

    
df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
   'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
# df[:5].as_matrix()









    Out[110]:





array([['2007-05-29', '101000', 41.95469, -87.800991, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '100000', 41.95469, -87.800991, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '100000', 41.994991, -87.769279, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '101000', 41.974089, -87.824812, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '100000', 41.974089, -87.824812, 4, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39']], dtype=object)



In [219]:

    
features = df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
           'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']].convert_objects(convert_numeric=True)



In [220]:

    
# null값이 있는 field는 WetBulb, StnPressure 
features.isnull().any(0)









    Out[220]:





Species         False
Latitude        False
Longitude       False
NumMosquitos    False
month           False
week            False
Tmax            False
Tmin            False
Tavg            False
Tavg            False
DewPoint        False
WetBulb          True
StnPressure      True
dtype: bool



In [221]:

    
# null 값은 총 26개가 있다, 뺄까? 중간값할까? 평균값으로 할까? 중간값으로 하자
print features['WetBulb'].isnull().sum()
print features['StnPressure'].isnull().sum()



In [210]:

    
features['WetBulb'][features['WetBulb'].isnull()]









    Out[210]:





4324   NaN
4325   NaN
4326   NaN
4327   NaN
4328   NaN
4329   NaN
4330   NaN
4331   NaN
4332   NaN
4333   NaN
4334   NaN
4335   NaN
4336   NaN
4337   NaN
4338   NaN
4339   NaN
4340   NaN
4341   NaN
4342   NaN
4343   NaN
4344   NaN
4345   NaN
4346   NaN
4347   NaN
4348   NaN
4349   NaN
Name: WetBulb, dtype: float64



In [222]:

    
m = features['WetBulb'].median()
features['WetBulb'] = features['WetBulb'].fillna(m)
features.isnull().any(0)









    Out[222]:





Species         False
Latitude        False
Longitude       False
NumMosquitos    False
month           False
week            False
Tmax            False
Tmin            False
Tavg            False
Tavg            False
DewPoint        False
WetBulb         False
StnPressure      True
dtype: bool



In [223]:

    
m = features['StnPressure'].median()
features['StnPressure'] = features['StnPressure'].fillna(m)
features.isnull().any(0)









    Out[223]:





Species         False
Latitude        False
Longitude       False
NumMosquitos    False
month           False
week            False
Tmax            False
Tmin            False
Tavg            False
Tavg            False
DewPoint        False
WetBulb         False
StnPressure     False
dtype: bool



In [224]:

    
features.dtypes









    Out[224]:





Species           int64
Latitude        float64
Longitude       float64
NumMosquitos      int64
month             int32
week              int32
Tmax              int64
Tmin              int64
Tavg              int64
Tavg              int64
DewPoint          int64
WetBulb         float64
StnPressure     float64
dtype: object



In [227]:

    
features = features.values.astype("float32")
features.dtype









    Out[227]:





dtype('float32')



In [228]:

    
def scale(X, eps = 0.001):
    # scale the data points s.t the columns of the feature space
    # (i.e the predictors) are within the range [0, 1]
    return (X - np.min(X, axis = 0)) / (np.max(X, axis = 0) + eps)



In [229]:

    
features = scale(features)



In [230]:

    
features.shape









    Out[230]:





(10506, 13)



In [233]:

    
labels = df['WnvPresent'].astype(np.int).values.reshape(features.shape[0], 1)



In [234]:

    
print features.shape
print labels.shape









    



(10506, 13)
(10506, 1)



In [236]:

    
shuffle(features, labels, random_state=111)









    Out[236]:





[array([[  9.89108905e-03,   6.85345731e-04,  -3.78302974e-03, ...,
           6.84922114e-02,   5.26308864e-02,   1.92236248e-02],
        [  9.90089118e-01,   8.87272600e-03,  -2.77874270e-03, ...,
           4.38350171e-01,   3.68416220e-01,   9.78047680e-03],
        [  9.99990106e-01,   7.37956353e-03,  -1.48525008e-03, ...,
           2.73968846e-01,   2.36838996e-01,   1.65255740e-02],
        ..., 
        [  9.90089118e-01,   7.14587979e-03,  -1.12221809e-03, ...,
           3.97254825e-01,   3.02627593e-01,   2.69805151e-03],
        [  9.89108905e-03,   5.06424066e-03,  -2.69619958e-03, ...,
           1.91778198e-01,   1.44734934e-01,   1.07922060e-02],
        [  9.99990106e-01,   5.86660998e-03,  -3.15894582e-03, ...,
           2.73968846e-01,   2.36838996e-01,   1.51765803e-02]], dtype=float32),
 array([[0],
        [0],
        [0],
        ..., 
        [0],
        [0],
        [0]])]



In [239]:

    
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

labels = np_utils.to_categorical(labels)



In [243]:

    
print features[0]
print labels[0]









    



[ 0.64218324  1.00721312  1.04734027 -0.73472261 -2.5219357  -2.14002585
  0.73019904 -0.55330187  0.0956296   0.0956296  -0.20868506  0.01431369
  0.79072708]
[ 1.  0.]



In [242]:

    
a = StandardScaler()
a.fit(np.array([1,2,3,4,5,6,7,11,12,15,100,200]))
a.transform([1,2,3,4,5,6,7,11,12,15,100,200])









    



/Users/dikien/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:332: UserWarning: StandardScaler assumes floating point values as input, got int64
  "got %s" % (estimator, X.dtype))






    Out[242]:





array([-0.51439363, -0.49695656, -0.47951948, -0.46208241, -0.44464534,
       -0.42720827, -0.4097712 , -0.34002291, -0.32258584, -0.27027462,
        1.21187652,  2.95558373])



In [2]:

    
df_test = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/test.csv")
df_test.columns









    Out[2]:





Index([u'Id', u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy'],
      dtype='object')

	Date	Species	Latitude	Longitude	NumMosquitos	WnvPresent	month	week
0	2007-05-29	CULEX PIPIENS/RESTUANS	41.95469	-87.800991	1	0	5	22
1	2007-05-29	CULEX RESTUANS	41.95469	-87.800991	1	0	5	22

	Latitude	Longitude	NumMosquitos	WnvPresent	month	week	Station
count	10506.000000	10506.000000	10506.000000	10506.000000	10506.000000	10506.000000	10506.000000
mean	41.841139	-87.699908	12.853512	0.052446	7.692557	31.687131	1.784790
std	0.112742	0.096514	16.133816	0.222936	1.067675	4.526874	0.410988
min	41.644612	-87.930995	1.000000	0.000000	5.000000	22.000000	1.000000
25%	41.732984	-87.760070	2.000000	0.000000	7.000000	28.000000	2.000000
50%	41.846283	-87.694991	5.000000	0.000000	8.000000	31.000000	2.000000
75%	41.954690	-87.627796	17.000000	0.000000	8.000000	35.000000	2.000000
max	42.017430	-87.531635	50.000000	1.000000	10.000000	41.000000	2.000000

	Date	Species	Latitude	Longitude	NumMosquitos	WnvPresent	month	week
0	2007-05-29	101000	41.95469	-87.800991	1	0	5	22
1	2007-05-29	100000	41.95469	-87.800991	1	0	5	22