In [11]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split

In [12]:
species_map = {'CULEX RESTUANS' : "100000",
              'CULEX TERRITANS' : "010000", 
              'CULEX PIPIENS'   : "001000", 
              'CULEX PIPIENS/RESTUANS' : "101000", 
              'CULEX ERRATICUS' : "000100", 
              'CULEX SALINARIUS': "000010", 
              'CULEX TARSALIS' :  "000001",
              'UNSPECIFIED CULEX': "001000"}

In [13]:
def closest_station(x):
    lat, longi = x[0], x[1]
    # Chicago is small enough that we can treat coordinates as rectangular.
    stations = np.array([[41.995, -87.933],
                         [41.786, -87.752]])
    loc = np.array([lat, longi])
    deltas = stations - loc[None, :]
    dist2 = (deltas**2).sum(1)
    return np.argmin(dist2) + 1

In [14]:
def scale(X, eps = 0.001):
    # scale the data points s.t the columns of the feature space
    # (i.e the predictors) are within the range [0, 1]
    return (X - np.min(X, axis = 0)) / (np.max(X, axis = 0) + eps)

In [19]:
def preprocess(train, weather, random_state):
    
    # weather 정보 불러오기
    df_weather = pd.read_csv(weather)
    
    # 필요한 컬럼만 추출
    df_weather = df_weather[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
    
    # train 정보 불러오기
    df_train = pd.read_csv(train)
    # 필요한 컬럼만 추출
    df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent']]
    # 년 -> 월, 주 컬럼 추가
    df_train['month'] = pd.DatetimeIndex(df_train['Date']).month
    df_train['week'] = pd.DatetimeIndex(df_train['Date']).week
    
    # Species가 문자열이라서 숫자로 변경
    df_train['Species'] = df_train['Species'].map(lambda x : species_map[x])
    
    # 위도 경도를 기준으로 가까운 스테이션을 찾기 위해 Station 컬럼을 추가
    df_train['Station'] = df_train[['Latitude', 'Longitude']].apply(func=closest_station, axis=1)
    
    # Date, Station 을 기준으로 두개의 데이터 통합
    df = df_train.merge(df_weather, how='inner', on=['Date', 'Station'])
    
    # 값들을 숫자값으로 변경
    features = df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
           'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']].convert_objects(convert_numeric=True)
    
    # null값을 그 필드의 중간값으로 변경
    m = features['WetBulb'].median()
    features['WetBulb'] = features['WetBulb'].fillna(m)
    
    m = features['StnPressure'].median()
    features['StnPressure'] = features['StnPressure'].fillna(m)
    
    # data type변경
    features = features.values.astype("float32")
    
    # 스케일을 [0, 1]로 변경
    features = scale(features)
    
    # label을 추가
    labels = df['WnvPresent'].astype(np.int).values.reshape(features.shape[0], 1)
    
    features, labels = shuffle(features, labels, random_state=random_state)
    return features, labels


#     labels = np_utils.to_categorical(labels)    
#     scaler = StandardScaler()
#     scaler.fit(features)
#     features = scaler.transform(features)

In [24]:
# 위를 바탕으로 함수를 만들어 보자.

def preprocess_test(train, weather):
    
    # weather 정보 불러오기
    df_weather = pd.read_csv(weather)
    
    # 필요한 컬럼만 추출
    df_weather = df_weather[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
    
    # train 정보 불러오기
    df_train = pd.read_csv(train)
    # 필요한 컬럼만 추출
    df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude', 'Id']]
    # 년 -> 월, 주 컬럼 추가
    df_train['month'] = pd.DatetimeIndex(df_train['Date']).month
    df_train['week'] = pd.DatetimeIndex(df_train['Date']).week
    
    # Species가 문자열이라서 숫자로 변경
    df_train['Species'] = df_train['Species'].map(lambda x : species_map[x])
    
    # 위도 경도를 기준으로 가까운 스테이션을 찾기 위해 Station 컬럼을 추가
    df_train['Station'] = df_train[['Latitude', 'Longitude']].apply(func=closest_station, axis=1)
    
    # Date, Station 을 기준으로 두개의 데이터 통합
    df = df_train.merge(df_weather, how='inner', on=['Date', 'Station'])
    
    # 값들을 숫자값으로 변경
    features = df[['Species', 'Latitude', 'Longitude', 'Id', 'month', 'week',
           'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']].convert_objects(convert_numeric=True)
    
    # null값을 그 필드의 중간값으로 변경
    m = features['WetBulb'].median()
    features['WetBulb'] = features['WetBulb'].fillna(m)
    
    m = features['StnPressure'].median()
    features['StnPressure'] = features['StnPressure'].fillna(m)
    
    # data type변경
    features = features.values.astype("float32")
    
    # 스케일을 [0, 1]로 변경
    features = scale(features)

    return features

In [16]:
random_state = 1111

In [17]:
features, labels = preprocess("/Users/dikien/Downloads/West Nile Virus Prediction/train.csv",
                            "/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv",
                           random_state=random_state)
(trainX, valX, trainY, valY) = train_test_split(features, labels, test_size = 0.1)

trainY = np_utils.to_categorical(trainY, 2)
valY = np_utils.to_categorical(valY, 2)

In [18]:
input_dim = trainX.shape[1]
output_dim = 2

model = Sequential()
model.add(Dense(input_dim, 32, init='lecun_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(32, 32, init='lecun_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(32, output_dim, init='lecun_uniform'))
model.add(Activation('softmax'))

print("Building model...")
model.compile(loss='categorical_crossentropy', optimizer="adadelta")

print("Training model...")
model.fit(trainX, trainY, nb_epoch=5, batch_size=16, validation_data=(valX, valY), verbose=1)

score = model.evaluate(valX, valY, show_accuracy=True, verbose=1, batch_size=32)
print 'score : %s' %score[1]

valid_preds = model.predict_proba(valX, verbose=1)
valid_preds = valid_preds[:, 0]
roc = metrics.roc_auc_score(valY, valid_preds)
print("ROC:", roc)


Building model...
Training model...
Train on 9455 samples, validate on 1051 samples
Epoch 0
9455/9455 [==============================] - 0s - loss: 0.2292 - val. loss: 0.1848
Epoch 1
9455/9455 [==============================] - 0s - loss: 0.1982 - val. loss: 0.1772
Epoch 2
9455/9455 [==============================] - 0s - loss: 0.1927 - val. loss: 0.1721
Epoch 3
9455/9455 [==============================] - 0s - loss: 0.1870 - val. loss: 0.1696
Epoch 4
9455/9455 [==============================] - 0s - loss: 0.1858 - val. loss: 0.1673
1051/1051 [==============================] - 0s - loss: 0.1679 - acc.: 0.9486     
score : 0.948863636364
1051/1051 [==============================] - 0s     
('ROC:', 0.82793751625246115)

In [25]:
testX = preprocess_test("/Users/dikien/Downloads/West Nile Virus Prediction/test.csv",
                        "/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv")

valid_preds = model.predict_proba(testX, verbose=1)
valid_preds = valid_preds[: , 1]


116293/116293 [==============================] - 0s     

In [26]:
valid_preds.shape


Out[26]:
(116293,)

In [84]:
df1 = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/test.csv", usecols=['Id'], dtype={'Id': np.int})
df2 = pd.DataFrame({'WnvPresent' : valid_preds})
df = df1.join(df2)
df.to_csv('Submission.csv', index=False)