In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [4]:
# Feature: Month, Week, Latitude, Longitude, NumMosq in Nearest Area, Near Dis, TMax, Tmin, 
# date = get("Date") -> weather
# month = float(date.split('-')[1]) -> weather
# week = int(date.split('-')[1]) * 4 + int(date.split('-')[2]) / 7 -> weather
# latitude = float(get("Latitude"))
# longitude = float(get("Longitude"))
# tmax = float(weather_dic[date][weather_indexes["Tmax"]]) -> weather
# tmin = float(weather_dic[date][weather_indexes["Tmin"]]) -> weather
# tavg = float(weather_dic[date][weather_indexes["Tavg"]]) -> weather
# dewpoint = float(weather_dic[date][weather_indexes["DewPoint"]]) -> weather
# wetbulb = float(weather_dic[date][weather_indexes["WetBulb"]]) -> weather
# pressure = float(weather_dic[date][weather_indexes["StnPressure"]])-> weather

In [64]:
df_weather = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv")
df_weather.columns


Out[64]:
Index([u'Station', u'Date', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint',
       u'WetBulb', u'Heat', u'Cool', u'Sunrise', u'Sunset', u'CodeSum',
       u'Depth', u'Water1', u'SnowFall', u'PrecipTotal', u'StnPressure',
       u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed'],
      dtype='object')

In [66]:
df_weather = df_weather[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
df_weather.head(2)


Out[66]:
Date Station Tmax Tmin Tavg DewPoint WetBulb StnPressure
0 2007-05-01 1 83 50 67 51 56 29.10
1 2007-05-01 2 84 52 68 51 57 29.18

In [56]:
# Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

In [80]:
def closest_station(x):
    lat, longi = x[0], x[1]
    # Chicago is small enough that we can treat coordinates as rectangular.
    stations = np.array([[41.995, -87.933],
                         [41.786, -87.752]])
    loc = np.array([lat, longi])
    deltas = stations - loc[None, :]
    dist2 = (deltas**2).sum(1)
    return np.argmin(dist2) + 1

In [61]:
df_train = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/train.csv")
df_train.columns


Out[61]:
Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')

In [62]:
df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent']]
df_train['month'] = pd.DatetimeIndex(df_train['Date']).month
df_train['week'] = pd.DatetimeIndex(df_train['Date']).week

In [67]:
df_train.head(2)


Out[67]:
Date Species Latitude Longitude NumMosquitos WnvPresent month week
0 2007-05-29 CULEX PIPIENS/RESTUANS 41.95469 -87.800991 1 0 5 22
1 2007-05-29 CULEX RESTUANS 41.95469 -87.800991 1 0 5 22

In [123]:
df_train.describe()


Out[123]:
Latitude Longitude NumMosquitos WnvPresent month week Station
count 10506.000000 10506.000000 10506.000000 10506.000000 10506.000000 10506.000000 10506.000000
mean 41.841139 -87.699908 12.853512 0.052446 7.692557 31.687131 1.784790
std 0.112742 0.096514 16.133816 0.222936 1.067675 4.526874 0.410988
min 41.644612 -87.930995 1.000000 0.000000 5.000000 22.000000 1.000000
25% 41.732984 -87.760070 2.000000 0.000000 7.000000 28.000000 2.000000
50% 41.846283 -87.694991 5.000000 0.000000 8.000000 31.000000 2.000000
75% 41.954690 -87.627796 17.000000 0.000000 8.000000 35.000000 2.000000
max 42.017430 -87.531635 50.000000 1.000000 10.000000 41.000000 2.000000

In [69]:
species_map = {'CULEX RESTUANS' : "100000",
              'CULEX TERRITANS' : "010000", 
              'CULEX PIPIENS'   : "001000", 
              'CULEX PIPIENS/RESTUANS' : "101000", 
              'CULEX ERRATICUS' : "000100", 
              'CULEX SALINARIUS': "000010", 
              'CULEX TARSALIS' :  "000001",
              'UNSPECIFIED CULEX': "001000"}

In [71]:
df_train['Species'] = df_train['Species'].map(lambda x : species_map[x])
df_train.head(2)


Out[71]:
Date Species Latitude Longitude NumMosquitos WnvPresent month week
0 2007-05-29 101000 41.95469 -87.800991 1 0 5 22
1 2007-05-29 100000 41.95469 -87.800991 1 0 5 22

In [82]:
df_train['Station'] = df_train[['Latitude', 'Longitude']].apply(func=closest_station, axis=1)
# 0 : apply function to each column
# 1 : apply function to each row
df_train.head(2)


Out[82]:
Date Species Latitude Longitude NumMosquitos WnvPresent month week Station
0 2007-05-29 101000 41.95469 -87.800991 1 0 5 22 1
1 2007-05-29 100000 41.95469 -87.800991 1 0 5 22 1

In [74]:
df_weather[df_weather['Date'] == '2007-05-31']


Out[74]:
Date Station Tmax Tmin Tavg DewPoint WetBulb StnPressure
60 2007-05-31 1 78 65 72 61 65 29.28
61 2007-05-31 2 80 68 74 61 66 29.34

In [72]:
df_train[df_train['Date'] == '2007-05-29'].head(2)


Out[72]:
Date Species Latitude Longitude NumMosquitos WnvPresent month week
0 2007-05-29 101000 41.95469 -87.800991 1 0 5 22
1 2007-05-29 100000 41.95469 -87.800991 1 0 5 22

In [105]:
# df_train[:50]
# df_weather[55:90]
# merge전에 위에 데이터로 테스트해봄

In [106]:
df = df_train.merge(df_weather, how='inner', on=['Date', 'Station'])
df.head(3)


Out[106]:
Date Species Latitude Longitude NumMosquitos WnvPresent month week Station Tmax Tmin Tavg DewPoint WetBulb StnPressure
0 2007-05-29 101000 41.954690 -87.800991 1 0 5 22 1 88 60 74 58 65 29.39
1 2007-05-29 100000 41.954690 -87.800991 1 0 5 22 1 88 60 74 58 65 29.39
2 2007-05-29 100000 41.994991 -87.769279 1 0 5 22 1 88 60 74 58 65 29.39

In [110]:
df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
   'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
# df[:5].as_matrix()


Out[110]:
array([['2007-05-29', '101000', 41.95469, -87.800991, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '100000', 41.95469, -87.800991, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '100000', 41.994991, -87.769279, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '101000', 41.974089, -87.824812, 1, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39'],
       ['2007-05-29', '100000', 41.974089, -87.824812, 4, 0, 5, 22, 1, 88,
        60, '74', 58, '65', '29.39']], dtype=object)

In [219]:
features = df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
           'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']].convert_objects(convert_numeric=True)

In [220]:
# null값이 있는 field는 WetBulb, StnPressure 
features.isnull().any(0)


Out[220]:
Species         False
Latitude        False
Longitude       False
NumMosquitos    False
month           False
week            False
Tmax            False
Tmin            False
Tavg            False
Tavg            False
DewPoint        False
WetBulb          True
StnPressure      True
dtype: bool

In [221]:
# null 값은 총 26개가 있다, 뺄까? 중간값할까? 평균값으로 할까? 중간값으로 하자
print features['WetBulb'].isnull().sum()
print features['StnPressure'].isnull().sum()


26
26

In [210]:
features['WetBulb'][features['WetBulb'].isnull()]


Out[210]:
4324   NaN
4325   NaN
4326   NaN
4327   NaN
4328   NaN
4329   NaN
4330   NaN
4331   NaN
4332   NaN
4333   NaN
4334   NaN
4335   NaN
4336   NaN
4337   NaN
4338   NaN
4339   NaN
4340   NaN
4341   NaN
4342   NaN
4343   NaN
4344   NaN
4345   NaN
4346   NaN
4347   NaN
4348   NaN
4349   NaN
Name: WetBulb, dtype: float64

In [222]:
m = features['WetBulb'].median()
features['WetBulb'] = features['WetBulb'].fillna(m)
features.isnull().any(0)


Out[222]:
Species         False
Latitude        False
Longitude       False
NumMosquitos    False
month           False
week            False
Tmax            False
Tmin            False
Tavg            False
Tavg            False
DewPoint        False
WetBulb         False
StnPressure      True
dtype: bool

In [223]:
m = features['StnPressure'].median()
features['StnPressure'] = features['StnPressure'].fillna(m)
features.isnull().any(0)


Out[223]:
Species         False
Latitude        False
Longitude       False
NumMosquitos    False
month           False
week            False
Tmax            False
Tmin            False
Tavg            False
Tavg            False
DewPoint        False
WetBulb         False
StnPressure     False
dtype: bool

In [224]:
features.dtypes


Out[224]:
Species           int64
Latitude        float64
Longitude       float64
NumMosquitos      int64
month             int32
week              int32
Tmax              int64
Tmin              int64
Tavg              int64
Tavg              int64
DewPoint          int64
WetBulb         float64
StnPressure     float64
dtype: object

In [227]:
features = features.values.astype("float32")
features.dtype


Out[227]:
dtype('float32')

In [228]:
def scale(X, eps = 0.001):
    # scale the data points s.t the columns of the feature space
    # (i.e the predictors) are within the range [0, 1]
    return (X - np.min(X, axis = 0)) / (np.max(X, axis = 0) + eps)

In [229]:
features = scale(features)

In [230]:
features.shape


Out[230]:
(10506, 13)

In [233]:
labels = df['WnvPresent'].astype(np.int).values.reshape(features.shape[0], 1)

In [234]:
print features.shape
print labels.shape


(10506, 13)
(10506, 1)

In [236]:
shuffle(features, labels, random_state=111)


Out[236]:
[array([[  9.89108905e-03,   6.85345731e-04,  -3.78302974e-03, ...,
           6.84922114e-02,   5.26308864e-02,   1.92236248e-02],
        [  9.90089118e-01,   8.87272600e-03,  -2.77874270e-03, ...,
           4.38350171e-01,   3.68416220e-01,   9.78047680e-03],
        [  9.99990106e-01,   7.37956353e-03,  -1.48525008e-03, ...,
           2.73968846e-01,   2.36838996e-01,   1.65255740e-02],
        ..., 
        [  9.90089118e-01,   7.14587979e-03,  -1.12221809e-03, ...,
           3.97254825e-01,   3.02627593e-01,   2.69805151e-03],
        [  9.89108905e-03,   5.06424066e-03,  -2.69619958e-03, ...,
           1.91778198e-01,   1.44734934e-01,   1.07922060e-02],
        [  9.99990106e-01,   5.86660998e-03,  -3.15894582e-03, ...,
           2.73968846e-01,   2.36838996e-01,   1.51765803e-02]], dtype=float32),
 array([[0],
        [0],
        [0],
        ..., 
        [0],
        [0],
        [0]])]

In [239]:
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

labels = np_utils.to_categorical(labels)

In [243]:
print features[0]
print labels[0]


[ 0.64218324  1.00721312  1.04734027 -0.73472261 -2.5219357  -2.14002585
  0.73019904 -0.55330187  0.0956296   0.0956296  -0.20868506  0.01431369
  0.79072708]
[ 1.  0.]

In [242]:
a = StandardScaler()
a.fit(np.array([1,2,3,4,5,6,7,11,12,15,100,200]))
a.transform([1,2,3,4,5,6,7,11,12,15,100,200])


/Users/dikien/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:332: UserWarning: StandardScaler assumes floating point values as input, got int64
  "got %s" % (estimator, X.dtype))
Out[242]:
array([-0.51439363, -0.49695656, -0.47951948, -0.46208241, -0.44464534,
       -0.42720827, -0.4097712 , -0.34002291, -0.32258584, -0.27027462,
        1.21187652,  2.95558373])

In [2]:
df_test = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/test.csv")
df_test.columns


Out[2]:
Index([u'Id', u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy'],
      dtype='object')