In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
In [4]:
# Feature: Month, Week, Latitude, Longitude, NumMosq in Nearest Area, Near Dis, TMax, Tmin,
# date = get("Date") -> weather
# month = float(date.split('-')[1]) -> weather
# week = int(date.split('-')[1]) * 4 + int(date.split('-')[2]) / 7 -> weather
# latitude = float(get("Latitude"))
# longitude = float(get("Longitude"))
# tmax = float(weather_dic[date][weather_indexes["Tmax"]]) -> weather
# tmin = float(weather_dic[date][weather_indexes["Tmin"]]) -> weather
# tavg = float(weather_dic[date][weather_indexes["Tavg"]]) -> weather
# dewpoint = float(weather_dic[date][weather_indexes["DewPoint"]]) -> weather
# wetbulb = float(weather_dic[date][weather_indexes["WetBulb"]]) -> weather
# pressure = float(weather_dic[date][weather_indexes["StnPressure"]])-> weather
In [64]:
# Read the raw weather observations from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_weather = pd.read_csv(
    filepath_or_buffer="/Users/dikien/Downloads/West Nile Virus Prediction/weather.csv"
)
# Display the available column names.
df_weather.columns
Out[64]:
In [66]:
# Narrow the weather frame to the Date/Station keys plus the six
# measurement columns that are used further down.
df_weather = df_weather[[
    'Date', 'Station',
    'Tmax', 'Tmin', 'Tavg',
    'DewPoint', 'WetBulb', 'StnPressure',
]]
df_weather.head(n=2)
Out[66]:
In [56]:
# Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
In [80]:
def closest_station(x):
    """Return the number of the weather station nearest to a location.

    Parameters
    ----------
    x : sequence of two floats
        ``(latitude, longitude)`` in degrees. May be a list, tuple, NumPy
        array, or a pandas row such as the one ``DataFrame.apply(...,
        axis=1)`` passes in.

    Returns
    -------
    NumPy integer
        1 if the first station in the table below is nearer (41.995,
        -87.933), 2 if the second is nearer (41.786, -87.752).
    """
    # Read the pair by position. Converting through np.asarray avoids
    # integer-key lookup (``x[0]``) on a label-indexed pandas Series, which
    # newer pandas versions no longer treat as positional.
    lat, longi = np.asarray(x, dtype=float)[:2]
    # Chicago is small enough that we can treat coordinates as rectangular.
    stations = np.array([[41.995, -87.933],
                         [41.786, -87.752]])
    loc = np.array([lat, longi])
    # Squared Euclidean distance to each station; the square root is not
    # needed to find the minimum.
    deltas = stations - loc[None, :]
    dist2 = (deltas**2).sum(1)
    # argmin is 0-based, station numbers are 1-based.
    return np.argmin(dist2) + 1
In [61]:
# Read the labelled training records from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_train = pd.read_csv(
    filepath_or_buffer="/Users/dikien/Downloads/West Nile Virus Prediction/train.csv"
)
# Display the available column names.
df_train.columns
Out[61]:
In [62]:
# Keep the date, species, location, mosquito count and target columns.
# .copy() makes this an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning.
df_train = df_train[['Date', 'Species', 'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent']].copy()
# Parse the dates once and reuse them for both calendar features.
dates = pd.DatetimeIndex(df_train['Date'])
df_train['month'] = dates.month
# ISO week number. DatetimeIndex.week is deprecated/removed in newer pandas;
# Timestamp.isocalendar() is available in every version and yields the same
# ISO week value.
df_train['week'] = [d.isocalendar()[1] for d in dates]
In [67]:
# Preview the first two training rows.
df_train.head(2)
Out[67]:
In [123]:
# Summary statistics of the training frame.
df_train.describe()
Out[123]:
In [69]:
# Six-character 0/1 code for each species label. The combined label
# 'CULEX PIPIENS/RESTUANS' carries the digits of both RESTUANS and PIPIENS,
# and 'UNSPECIFIED CULEX' shares the code of 'CULEX PIPIENS'.
species_map = dict([
    ('CULEX RESTUANS', "100000"),
    ('CULEX TERRITANS', "010000"),
    ('CULEX PIPIENS', "001000"),
    ('CULEX PIPIENS/RESTUANS', "101000"),
    ('CULEX ERRATICUS', "000100"),
    ('CULEX SALINARIUS', "000010"),
    ('CULEX TARSALIS', "000001"),
    ('UNSPECIFIED CULEX', "001000"),
])
In [71]:
# Swap every species label for its code string; a label that is missing
# from species_map raises KeyError.
df_train['Species'] = df_train['Species'].map(species_map.__getitem__)
df_train.head(n=2)
Out[71]:
In [82]:
# axis=1 hands closest_station one row (Latitude, Longitude) at a time;
# axis=0 would hand it whole columns instead.
coordinates = df_train[['Latitude', 'Longitude']]
df_train['Station'] = coordinates.apply(closest_station, axis=1)
df_train.head(n=2)
Out[82]:
In [74]:
# Show the weather rows dated 2007-05-31.
df_weather[df_weather['Date'] == '2007-05-31']
Out[74]:
In [72]:
# Show the first two training rows dated 2007-05-29.
df_train[df_train['Date'] == '2007-05-29'].head(2)
Out[72]:
In [105]:
# df_train[:50]
# df_weather[55:90]
# Tested with the data slices above before running the merge.
In [106]:
# Attach the weather readings to each training row by matching Date and
# Station; the inner join keeps only rows that have a match on both sides.
df = pd.merge(df_train, df_weather, how='inner', on=['Date', 'Station'])
df.head(n=3)
Out[106]:
In [110]:
# Preview the column selection that is later assigned to `features`.
# NOTE(review): 'Tavg' is listed twice, so the selection contains a duplicate
# column — confirm whether this is intended.
df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]
# df[:5].as_matrix()
Out[110]:
In [219]:
# Feature frame: species code, location, mosquito count, calendar and
# weather columns, all coerced to numeric.
# NOTE(review): 'Tavg' appears twice, so the frame carries a duplicate
# column; kept as-is to preserve the column count — confirm whether this is
# intended.
# DataFrame.convert_objects has been removed from pandas. pd.to_numeric with
# errors='coerce' performs the same conversion: numeric strings become
# numbers and unparseable entries become NaN.
features = df[['Species', 'Latitude', 'Longitude', 'NumMosquitos', 'month', 'week',
               'Tmax', 'Tmin', 'Tavg', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']
              ].apply(pd.to_numeric, errors='coerce')
In [220]:
# The fields that contain null values are WetBulb and StnPressure.
features.isnull().any(axis=0)
Out[220]:
In [221]:
# There are 26 null values in total. Options considered: drop the rows, fill
# with the median, or fill with the mean — the median was chosen.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(features['WetBulb'].isnull().sum())
print(features['StnPressure'].isnull().sum())
In [210]:
# Show the WetBulb entries that are null.
features['WetBulb'][features['WetBulb'].isnull()]
Out[210]:
In [222]:
# Replace missing WetBulb readings with the column median, then re-check
# which columns still contain nulls.
wet_bulb = features['WetBulb']
m = wet_bulb.median()
features['WetBulb'] = wet_bulb.fillna(value=m)
features.isnull().any(axis=0)
Out[222]:
In [223]:
# Replace missing StnPressure readings with the column median, then re-check
# which columns still contain nulls.
stn_pressure = features['StnPressure']
m = stn_pressure.median()
features['StnPressure'] = stn_pressure.fillna(value=m)
features.isnull().any(axis=0)
Out[223]:
In [224]:
# Inspect the dtype of every feature column.
features.dtypes
Out[224]:
In [227]:
# Replace the DataFrame with its values as a float32 NumPy array.
# Note: after this cell `features` is an ndarray, so the cell cannot be
# re-run without rebuilding the DataFrame first.
features = features.values.astype(np.float32)
features.dtype
Out[227]:
In [228]:
def scale(X, eps = 0.001):
    """Min-max scale each column of ``X`` into the range [0, 1].

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to scale; every column is scaled independently.
    eps : float, default 0.001
        Small constant added to each column's range so that a constant
        column does not cause a division by zero.

    Returns
    -------
    Array of the same shape as ``X`` with each column's minimum mapped to 0
    and its maximum mapped to ``range / (range + eps)`` (just below 1 for a
    positive ``eps``).
    """
    col_min = np.min(X, axis = 0)
    # Divide by the column range (max - min), not by the maximum alone:
    # dividing by the maximum does not map the column maximum to ~1 and
    # yields values outside [0, 1] when a column contains negative values
    # (e.g. longitudes).
    col_range = np.max(X, axis = 0) - col_min
    return (X - col_min) / (col_range + eps)
In [229]:
# Apply scale() to the feature array and keep the result under the same name.
features = scale(features)
In [230]:
# Check the dimensions of the feature array.
features.shape
Out[230]:
In [233]:
# Target as an (n_rows, 1) integer column with one entry per row of
# `features`.
# The builtin int replaces the np.int alias, which newer NumPy releases no
# longer provide; the resulting dtype is the same.
labels = df['WnvPresent'].astype(int).values.reshape(features.shape[0], 1)
In [234]:
# Confirm that features and labels have the same number of rows.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(features.shape)
print(labels.shape)
In [236]:
# sklearn.utils.shuffle returns shuffled copies and leaves its inputs
# untouched, so the result has to be assigned back; previously the shuffled
# arrays were only displayed and then discarded. Shuffling both arrays in one
# call keeps every feature row paired with its label.
features, labels = shuffle(features, labels, random_state=111)
Out[236]:
In [239]:
# Standardise the features with a scaler fitted on the same data.
# NOTE(review): StandardScaler and np_utils are not imported in the cells
# visible here — confirm they are imported elsewhere.
scaler = StandardScaler().fit(features)
features = scaler.transform(features)
# Convert the label column with np_utils.to_categorical (presumably one-hot
# encoding — confirm against the np_utils import).
labels = np_utils.to_categorical(labels)
In [243]:
# Inspect the first feature row and its label.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(features[0])
print(labels[0])
In [242]:
# Try StandardScaler on a small hand-made sample.
# scikit-learn expects 2-D input of shape (n_samples, n_features); a flat 1-D
# array is rejected by current releases, so the values are reshaped into a
# single-feature column and the same array is used for fit and transform.
sample = np.array([1,2,3,4,5,6,7,11,12,15,100,200], dtype=float).reshape(-1, 1)
a = StandardScaler()
a.fit(sample)
a.transform(sample)
Out[242]:
In [2]:
# Read the unlabelled test records from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_test = pd.read_csv(
    filepath_or_buffer="/Users/dikien/Downloads/West Nile Virus Prediction/test.csv"
)
# Display the available column names.
df_test.columns
Out[2]: