In [1]:
import pandas as pd
In [3]:
ls
In [9]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [10]:
print('train columns')
print(train.columns)
print("=" * 100)
print('test columns')
print(test.columns)
In [11]:
# datetime - hourly date + timestamp
# season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
# holiday - whether the day is considered a holiday
# workingday - whether the day is neither a weekend nor holiday
# weather - 1: Clear, Few clouds, Partly cloudy
# 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
# 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
# 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
# temp - temperature in Celsius
# atemp - "feels like" temperature in Celsius
# humidity - relative humidity
# windspeed - wind speed
# casual - number of non-registered user rentals initiated
# registered - number of registered user rentals initiated
# count - number of total rentals
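Since season and weather are integer codes, it can help during exploration to map them to readable labels. A minimal sketch (the label dictionaries below are my own paraphrase of the data dictionary above):

season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
weather_labels = {1: 'clear', 2: 'mist/cloudy', 3: 'light rain/snow', 4: 'heavy rain/snow'}
# average rentals per season, with the numeric code replaced by its name
print(train.groupby(train['season'].map(season_labels))['count'].mean())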
In [25]:
train.head(5)
Out[25]:
In [17]:
test.head(5)
Out[17]:
In [31]:
# check whether there are any NaN values
print(train.isnull().any(axis=0))
print()
print(test.isnull().any(axis=0))
# train: humidity, windspeed, casual, registered, and count contain null values
# test: no null values
In [38]:
# there is only one null value per column. Was this done on purpose?
train.isnull().sum()
Out[38]:
In [40]:
# drop the rows with null values
train = train.dropna()
train.isnull().sum()
Out[40]:
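Dropping is harmless here since only a single row is affected; with more missing data, an alternative (shown only as a sketch, not what this notebook does) would be to fill numeric gaps with column medians instead of dropping rows:

# fill missing numeric values with each column's median instead of dropping rows
numeric_cols = train.select_dtypes(include='number').columns
train[numeric_cols] = train[numeric_cols].fillna(train[numeric_cols].median())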
In [54]:
# add month, week, and hour as new features
train['month'] = pd.DatetimeIndex(train['datetime']).month
train['week'] = pd.DatetimeIndex(train['datetime']).week
train['hour'] = pd.DatetimeIndex(train['datetime']).hour
train.head(3)
Out[54]:
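Note that DatetimeIndex.week is deprecated in recent pandas releases; assuming a reasonably recent pandas, the same three features can be derived through the .dt accessor on a parsed datetime column:

# parse once, then use the .dt accessor (isocalendar().week replaces the deprecated .week)
train['datetime'] = pd.to_datetime(train['datetime'])
train['month'] = train['datetime'].dt.month
train['week'] = train['datetime'].dt.isocalendar().week.astype('int64')
train['hour'] = train['datetime'].dt.hour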
In [56]:
train.describe()
Out[56]:
In [65]:
features = train[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
                  'humidity', 'windspeed', 'month', 'week', 'hour']].values.astype("float32")
labels = train['count'].values.astype("int")
In [66]:
print(features)
print(labels)
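Before running the grid search below, it is worth holding out part of the training data for an honest error estimate; Kaggle scores this competition with RMSLE, so a sketch of a split plus that metric (the helper name rmsle is my own) could look like this:

import numpy as np
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for validation
X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.2, random_state=0)

def rmsle(y_true, y_pred):
    # root mean squared logarithmic error, the competition's evaluation metric
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))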
In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV  # formerly sklearn.grid_search in older scikit-learn
In [68]:
parameters = {'min_samples_split' : [20], 'n_estimators' : [100, 200]}
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=1, verbose=1)
In [69]:
clf.fit(features, labels)
Out[69]:
In [78]:
print("Best score: %0.3f" % clf.best_score_)
print("Best parameters set:")
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [80]:
sample = pd.read_csv('sampleSubmission.csv')
sample.head(2)
Out[80]:
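What remains (not shown in the notebook) is to apply the same datetime features to test, predict with the tuned model, and write the result in the sampleSubmission layout; a sketch, assuming the submission file expects datetime and count columns:

# engineer the same features on the test set
test['month'] = pd.DatetimeIndex(test['datetime']).month
test['week'] = pd.DatetimeIndex(test['datetime']).week
test['hour'] = pd.DatetimeIndex(test['datetime']).hour

test_features = test[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
                      'humidity', 'windspeed', 'month', 'week', 'hour']].values.astype("float32")

submission = pd.DataFrame({'datetime': test['datetime'],
                           'count': clf.predict(test_features)})
submission.to_csv('submission.csv', index=False)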