In [11]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
In [12]:
# One-hot-style encoding of mosquito species as 6-character bit strings.
# 'CULEX PIPIENS/RESTUANS' gets both the RESTUANS and PIPIENS bits set;
# 'UNSPECIFIED CULEX' is mapped to the same code as 'CULEX PIPIENS'.
species_map = {
    'CULEX RESTUANS':         "100000",
    'CULEX TERRITANS':        "010000",
    'CULEX PIPIENS':          "001000",
    'CULEX PIPIENS/RESTUANS': "101000",
    'CULEX ERRATICUS':        "000100",
    'CULEX SALINARIUS':       "000010",
    'CULEX TARSALIS':         "000001",
    'UNSPECIFIED CULEX':      "001000",
}
In [13]:
def closest_station(x):
    """Return the id (1 or 2) of the weather station nearest to a point.

    Parameters
    ----------
    x : sequence where x[0] is latitude and x[1] is longitude.

    Returns
    -------
    1-based index of the nearest station.

    Chicago is small enough that we can treat coordinates as rectangular,
    so plain squared Euclidean distance in (lat, long) space is used.
    """
    point = np.array([x[0], x[1]])
    station_coords = np.array([[41.995, -87.933],
                               [41.786, -87.752]])
    # Squared distance to each station; no sqrt needed for an argmin.
    sq_dist = ((station_coords - point[None, :]) ** 2).sum(1)
    return np.argmin(sq_dist) + 1
In [14]:
def scale(X, eps = 0.001):
    """Min-max scale each column of X into (approximately) [0, 1].

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    eps : float, added to the denominator so constant columns do not
        divide by zero.

    Returns
    -------
    Array with each column shifted and scaled to roughly [0, 1].
    """
    col_min = np.min(X, axis = 0)
    col_max = np.max(X, axis = 0)
    # Bug fix: the original divided by (max + eps) rather than the column
    # range (max - min + eps), so columns were NOT mapped to [0, 1] as the
    # comment claimed (and negative columns could leave the range entirely).
    return (X - col_min) / (col_max - col_min + eps)
In [19]:
def preprocess(train, weather, random_state):
    """Build shuffled (features, labels) arrays from the train/weather CSVs.

    Parameters
    ----------
    train : path to train.csv (needs Date, Species, Latitude, Longitude,
        NumMosquitos, WnvPresent columns).
    weather : path to weather.csv (per-station daily measurements).
    random_state : seed used when shuffling the rows.

    Returns
    -------
    (features, labels) : float32 feature matrix scaled to [0, 1] and an
        (n, 1) integer label column.
    """
    # Load weather data and keep only the predictor columns.
    df_weather = pd.read_csv(weather)
    df_weather = df_weather[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg',
                             'DewPoint', 'WetBulb', 'StnPressure']]
    # Load training data and keep only the columns we need.
    df_train = pd.read_csv(train)
    df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude',
                         'NumMosquitos', 'WnvPresent']]
    # Derive month / week-of-year features from the date.
    df_train['month'] = pd.DatetimeIndex(df_train['Date']).month
    df_train['week'] = pd.DatetimeIndex(df_train['Date']).week
    # Species is a string; encode it via the bit-string species_map.
    df_train['Species'] = df_train['Species'].map(lambda x: species_map[x])
    # Attach the nearest weather station to each trap's (lat, long).
    df_train['Station'] = df_train[['Latitude', 'Longitude']].apply(
        func=closest_station, axis=1)
    # Join trap rows with the matching station's weather for that date.
    df = df_train.merge(df_weather, how='inner', on=['Date', 'Station'])
    # Coerce everything to numeric.  convert_objects(convert_numeric=True)
    # is deprecated; pd.to_numeric(errors='coerce') is the equivalent.
    # FIXME(review): 'Tavg' appears twice below (same in preprocess_test),
    # duplicating the column.  Remove it from BOTH functions together so the
    # train and test feature widths stay in sync.
    features = df[['Species', 'Latitude', 'Longitude', 'NumMosquitos',
                   'month', 'week', 'Tmax', 'Tmin', 'Tavg', 'Tavg',
                   'DewPoint', 'WetBulb',
                   'StnPressure']].apply(pd.to_numeric, errors='coerce')
    # Replace missing values with the column median.
    features['WetBulb'] = features['WetBulb'].fillna(features['WetBulb'].median())
    features['StnPressure'] = features['StnPressure'].fillna(
        features['StnPressure'].median())
    # Cast to float32 and min-max scale to [0, 1].
    features = features.values.astype("float32")
    features = scale(features)
    # Labels as an (n, 1) column vector (np.int is deprecated; use int).
    labels = df['WnvPresent'].astype(int).values.reshape(features.shape[0], 1)
    features, labels = shuffle(features, labels, random_state=random_state)
    return features, labels
In [24]:
# Same pipeline as preprocess(), adapted for the unlabelled test set.
def preprocess_test(train, weather):
    """Build the test feature matrix from the test/weather CSVs.

    Mirrors preprocess() but keeps the submission 'Id' column instead of
    NumMosquitos/WnvPresent and returns no labels.

    Parameters
    ----------
    train : path to test.csv (needs Date, Species, Latitude, Longitude, Id).
    weather : path to weather.csv (per-station daily measurements).

    Returns
    -------
    float32 feature matrix scaled to [0, 1].
    """
    # Load weather data and keep only the predictor columns.
    df_weather = pd.read_csv(weather)
    df_weather = df_weather[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg',
                             'DewPoint', 'WetBulb', 'StnPressure']]
    # Load test data and keep only the columns we need.
    df_train = pd.read_csv(train)
    df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude', 'Id']]
    # Derive month / week-of-year features from the date.
    df_train['month'] = pd.DatetimeIndex(df_train['Date']).month
    df_train['week'] = pd.DatetimeIndex(df_train['Date']).week
    # Species is a string; encode it via the bit-string species_map.
    df_train['Species'] = df_train['Species'].map(lambda x: species_map[x])
    # Attach the nearest weather station to each trap's (lat, long).
    df_train['Station'] = df_train[['Latitude', 'Longitude']].apply(
        func=closest_station, axis=1)
    # Join test rows with the matching station's weather for that date.
    df = df_train.merge(df_weather, how='inner', on=['Date', 'Station'])
    # Coerce everything to numeric.  convert_objects(convert_numeric=True)
    # is deprecated; pd.to_numeric(errors='coerce') is the equivalent.
    # NOTE(review): 'Id' is kept inside the scaled feature matrix as in the
    # original -- it is re-read from test.csv later for the submission file.
    # FIXME(review): 'Tavg' appears twice below (same in preprocess);
    # remove it from BOTH functions together to keep feature widths in sync.
    features = df[['Species', 'Latitude', 'Longitude', 'Id', 'month', 'week',
                   'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb',
                   'StnPressure']].apply(pd.to_numeric, errors='coerce')
    # Replace missing values with the column median.
    features['WetBulb'] = features['WetBulb'].fillna(features['WetBulb'].median())
    features['StnPressure'] = features['StnPressure'].fillna(
        features['StnPressure'].median())
    # Cast to float32 and min-max scale to [0, 1].
    features = features.values.astype("float32")
    features = scale(features)
    return features
In [16]:
# Fixed seed so shuffling and splitting are reproducible across runs.
random_state = 1111
In [17]:
# Build the training features/labels from the raw CSVs.
features, labels = preprocess("/Users/dikien/Downloads/West Nile Virus Prediction/train.csv",
                              "/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv",
                              random_state=random_state)
# Hold out 10% for validation.  Bug fix: pass random_state so the split is
# reproducible -- the seed defined above was otherwise unused here.
(trainX, valX, trainY, valY) = train_test_split(features, labels,
                                                test_size=0.1,
                                                random_state=random_state)
# One-hot encode the binary labels for the 2-way softmax output layer.
trainY = np_utils.to_categorical(trainY, 2)
valY = np_utils.to_categorical(valY, 2)
In [18]:
input_dim = trainX.shape[1]
output_dim = 2  # two classes: virus absent / present

# Small fully-connected net with dropout.
# NOTE(review): this is the old Keras 0.x Dense(n_in, n_out, init=...) API,
# kept as-is for compatibility with the environment the notebook ran in.
model = Sequential()
model.add(Dense(input_dim, 32, init='lecun_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(32, 32, init='lecun_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(32, output_dim, init='lecun_uniform'))
model.add(Activation('softmax'))

print("Building model...")
model.compile(loss='categorical_crossentropy', optimizer="adadelta")

print("Training model...")
model.fit(trainX, trainY, nb_epoch=5, batch_size=16,
          validation_data=(valX, valY), verbose=1)
score = model.evaluate(valX, valY, show_accuracy=True, verbose=1, batch_size=32)
# Bug fix: this line used a Python 2 print statement while the rest of the
# notebook uses the print() function; use the function form consistently.
print('score : %s' % score[1])

valid_preds = model.predict_proba(valX, verbose=1)
# Bug fix: the original scored column 0 (P(class 0)) against the full
# one-hot valY matrix.  Score the positive-class column of both instead,
# matching how the test-set predictions are taken later (column 1).
valid_preds = valid_preds[:, 1]
roc = metrics.roc_auc_score(valY[:, 1], valid_preds)
print("ROC:", roc)
In [25]:
# Build the test feature matrix and predict with the trained model.
testX = preprocess_test("/Users/dikien/Downloads/West Nile Virus Prediction/test.csv",
"/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv")
valid_preds = model.predict_proba(testX, verbose=1)
# Keep column 1 of the softmax output: P(WnvPresent = 1) for each row.
# NOTE(review): the name valid_preds is reused from the validation cell
# above; from here on it holds TEST-set probabilities.
valid_preds = valid_preds[: , 1]
In [26]:
# Sanity check: number of predictions (should equal the merged test rows).
valid_preds.shape
Out[26]:
In [84]:
# Pair each test Id with its predicted probability and write the submission.
# Bug fix: np.int is a deprecated alias (removed in NumPy >= 1.24); use the
# builtin int as the dtype instead.
df1 = pd.read_csv("/Users/dikien/Downloads/West Nile Virus Prediction/test.csv",
                  usecols=['Id'], dtype={'Id': int})
df2 = pd.DataFrame({'WnvPresent' : valid_preds})
# NOTE(review): join aligns by position/index; this assumes the inner merge
# in preprocess_test dropped no test rows -- verify len(df1) == len(df2).
df = df1.join(df2)
df.to_csv('Submission.csv', index=False)