In [1]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import scipy.stats as stats
# Sk cheats
from sklearn.cross_validation import cross_val_score # cross val
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import Imputer # get rid of nan
In [2]:
%%time
filename = "data/reduced_train_100000.csv"
#filename = "data/reduced_test_100000.csv"
raw = pd.read_csv(filename)
raw = raw.set_index('Id')
In [3]:
raw['Expected'].describe()
Out[3]:
Per Wikipedia, a rainfall rate above 421 mm/h is considered "Extreme/large hail".
If we ever encountered a value of 327.40 metres per hour, we should probably start building Noah's ark.
It therefore seems reasonable to drop excessively large values and treat them as outliers.
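Before fixing a cutoff, we can glance at the upper quantiles of Expected (a minimal sketch; the quantile levels are arbitrary choices):
In [ ]:
# Tail of the gauge readings, to get a feel for where the outliers start
raw['Expected'].quantile([0.5, 0.9, 0.99, 0.999, 1.0])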
In [4]:
# The gauge may concentrate the rainfall, so we cap Expected at 1000 mm
# Comment out the filtering line to analyse the complete dataset
l = len(raw)
raw = raw[raw['Expected'] < 1000]
print("Dropped %d rows (%0.2f%%)" % (l - len(raw), (l - len(raw)) / float(l) * 100))
In [5]:
raw.head(5)
Out[5]:
In [6]:
l = float(len(raw["minutes_past"]))
# Fraction of non-null values in each column
comp = [[1 - raw[i].isnull().sum() / l, i] for i in raw.columns]
comp.sort(key=lambda x: x[0], reverse=True)
sns.barplot(zip(*comp)[0], zip(*comp)[1],
            palette=sns.cubehelix_palette(len(comp), start=.5, rot=-.75))
plt.title("Fraction of non-NaN data per column")
plt.show()
We see that, except for the always-present features minutes_past, radardist_km and Expected, the dataset is mainly sparse.
Let's transform the dataset to conduct further analysis.
We regroup the data by Id.
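As a side note, a per-Id summary can also be obtained directly with groupby on the Id index (a minimal sketch, not used below):
In [ ]:
# Mean of every column per Id, a quick alternative to the explicit loop in getXy
raw.groupby(level='Id').mean().head()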
In [8]:
# We select all features except minutes_past,
# because we ignore the time distribution of the sequence for now
features_columns = list([u'Ref', u'Ref_5x5_10th',
                         u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
                         u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
                         u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
                         u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
                         u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
                         u'Kdp_5x5_50th', u'Kdp_5x5_90th'])

def getXy(raw):
    # Group the rows by Id: one (n_observations x n_features) matrix per Id in X,
    # and the corresponding gauge reading (Expected) in y
    selected_columns = list([u'radardist_km', u'Ref', u'Ref_5x5_10th',
                             u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
                             u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
                             u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
                             u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
                             u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
                             u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
    data = raw[selected_columns]
    docX, docY = [], []
    for i in data.index.unique():
        if isinstance(data.loc[i], pd.core.series.Series):
            # Single observation for this Id: wrap it to keep a 2D shape
            m = [data.loc[i].as_matrix()]
            docX.append(m)
            docY.append(float(raw.loc[i]["Expected"]))
        else:
            m = data.loc[i].as_matrix()
            docX.append(m)
            docY.append(float(raw.loc[i][:1]["Expected"]))
    X, y = np.array(docX), np.array(docY)
    return X, y
In [9]:
raw.index.unique()
Out[9]:
In [62]:
raw.isnull().sum()
Out[62]:
How many observations are there for each Id?
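A quick cross-check without rebuilding X (a minimal sketch):
In [ ]:
# Number of radar observations per Id, and how often each count occurs
raw.groupby(level='Id').size().value_counts().sort_index()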
In [63]:
X,y=getXy(raw)
tmp = []
for i in X:
tmp.append(len(i))
tmp = np.array(tmp)
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations\n(On complete dataset)")
plt.plot()
print("Average gauge observation in mm: %0.2f"%y.mean())
We see that a lot of Ids have 6 or 12 observations, which means roughly one every 10 or 5 minutes respectively.
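We can sanity-check this against minutes_past directly (a rough sketch; we assume minutes_past is the scan time in minutes within the hour):
In [ ]:
# Average gap in minutes between consecutive radar scans of the same Id
gaps = raw.groupby(level='Id')['minutes_past'].apply(
    lambda m: np.diff(np.sort(m.values)).mean() if len(m) > 1 else np.nan)
gaps.describe()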
In [64]:
pd.DataFrame(y).describe()
Out[64]:
Now let's run the same analysis on three subsets: rows with no missing value at all (noAnyNan), Ids with at least one radar measurement (noFullNan), and Ids with no radar measurement at all (fullNan).
In [31]:
#noAnyNan = raw.loc[raw[features_columns].dropna(how='any').index.unique()]
noAnyNan = raw.dropna()
In [66]:
noAnyNan.isnull().sum()
Out[66]:
In [32]:
X,y=getXy(noAnyNan)
tmp = []
for i in X:
tmp.append(len(i))
tmp = np.array(tmp)
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations\n(On fully filled dataset)")
plt.plot()
print("Average gauge observation in mm: %0.2f"%y.mean())
In [33]:
pd.DataFrame(y).describe()
Out[33]:
In [17]:
noFullNan = raw.loc[raw[features_columns].dropna(how='all').index.unique()]
In [67]:
noFullNan.isnull().sum()
Out[67]:
In [18]:
X,y=getXy(noFullNan)
tmp = []
for i in X:
tmp.append(len(i))
tmp = np.array(tmp)
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations\n(On partly filled dataset)")
plt.plot()
print("Average gauge observation in mm: %0.2f"%y.mean())
In [19]:
pd.DataFrame(y).describe()
Out[19]:
In [69]:
fullNan = raw.drop(raw[features_columns].dropna(how='all').index)
In [68]:
fullNan.isnull().sum()
Out[68]:
In [21]:
X,y=getXy(fullNan)
tmp = []
for i in X:
tmp.append(len(i))
tmp = np.array(tmp)
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations\n(On fully empty dataset)")
plt.plot()
print("Average gauge observation in mm: %0.2f"%y.mean())
In [22]:
pd.DataFrame(y).describe()
Out[22]:
Strangely, we notice that the fewer observations there are, the more it rains on average.
However, most of the expected rainfall values fall below 0.5 mm.
What prediction should we make when there is no data?
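One simple fallback, used for the submission below, is the 50th percentile of the gauge readings in that subset (a minimal sketch of how to obtain it):
In [ ]:
# Median gauge reading per Id on the fully empty subset
fullNan.groupby(level='Id')['Expected'].first().median()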
In [23]:
print("%d observations" %(len(raw)))
#print("%d fully filled, %d partly filled, %d fully empty"
# %(len(noAnyNan),len(noFullNan),len(raw)-len(noFullNan)))
print("%0.1f%% fully filled, %0.1f%% partly filled, %0.1f%% fully empty"
%(len(noAnyNan)/float(len(raw))*100,
len(noFullNan)/float(len(raw))*100,
(len(raw)-len(noFullNan))/float(len(raw))*100))
As a first try, we train a model on the fully filled data, and fall back to the 50th percentile for the partly filled and fully empty data.
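The overall per-Id rule can be summarised in one helper (an illustrative sketch; the function name and signature are ours, and the cells below implement the same steps one by one):
In [ ]:
# Sketch of the per-Id prediction rule (not used below; names are illustrative)
def predict_one(group, model, fallback_partial, fallback_empty):
    complete = group.dropna()                 # rows with no missing value
    if len(complete) > 0:                     # fully filled rows exist: use the model
        feats = complete[[u'radardist_km'] + features_columns].mean(0).values
        return model.predict([feats])[0]
    if group[features_columns].isnull().all().all():
        return fallback_empty                 # no radar data at all
    return fallback_partial                   # some radar data, but never a complete row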
In [25]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [34]:
X, y = getXy(noAnyNan)
XX = [np.array(t).mean(0) for t in X]  # average each Id's observations into one feature vector
In [35]:
split = 0.2
ps = int(len(XX) * (1-split))
X_train = XX[:ps]
y_train = y[:ps]
X_test = XX[ps:]
y_test = y[ps:]
In [36]:
%%time
etreg.fit(X_train,y_train)
Out[36]:
In [37]:
%%time
et_score = cross_val_score(etreg, XX, y, cv=5)
print("Score: %s\tMean: %.03f"%(et_score,et_score.mean()))
In [38]:
# Mean squared error on the held-out split
err = (etreg.predict(X_test) - y_test)**2
err.sum()/len(err)
Out[38]:
In [41]:
r = random.randrange(len(X_train))
print(r)
print(etreg.predict([X_train[r]]))  # predict expects a 2D input, hence the extra brackets
print(y_train[r])
In [61]:
r = random.randrange(len(X_test))
print(r)
print(etreg.predict([X_test[r]]))  # same single-sample wrapping as above
print(y_test[r])
In [72]:
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
test = test.set_index('Id')
In [71]:
# Same feature list as before (minutes_past is still ignored)
features_columns = list([u'Ref', u'Ref_5x5_10th',
                         u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
                         u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
                         u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
                         u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
                         u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
                         u'Kdp_5x5_50th', u'Kdp_5x5_90th'])

def getX(raw):
    # Same grouping as getXy, but without the Expected target (the test set has none)
    selected_columns = list([u'radardist_km', u'Ref', u'Ref_5x5_10th',
                             u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
                             u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
                             u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
                             u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
                             u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
                             u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
    data = raw[selected_columns]
    docX = []
    for i in data.index.unique():
        if isinstance(data.loc[i], pd.core.series.Series):
            m = [data.loc[i].as_matrix()]
            docX.append(m)
        else:
            m = data.loc[i].as_matrix()
            docX.append(m)
    X = np.array(docX)
    return X
In [74]:
X=getX(test)
tmp = []
for i in X:
tmp.append(len(i))
tmp = np.array(tmp)
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations\n(On test dataset)")
plt.plot()
#print("Average gauge observation in mm: %0.2f"%y.mean())
Out[74]:
In [ ]:
etreg.predict(X_test)
In [84]:
testFull = test.dropna()
In [85]:
X=getX(testFull)
XX = [np.array(t).mean(0) for t in X]
In [86]:
pd.DataFrame(etreg.predict(XX)).describe()
Out[86]:
In [91]:
predFull = zip(testFull.index.unique(),etreg.predict(XX))
In [104]:
predFull[:10]
Out[104]:
In [102]:
testNan = test.drop(test[features_columns].dropna(how='all').index)
In [116]:
tmp = np.empty(len(testNan))
tmp.fill(0.445000) # 50th percentile of full Nan dataset
predNan = zip(testNan.index.unique(),tmp)
In [106]:
predNan[:10]
Out[106]:
In [109]:
testLeft = test.drop(testNan.index.unique()).drop(testFull.index.unique())
In [117]:
tmp = np.empty(len(testLeft))
tmp.fill(1.27) # 50th percentile of the partly filled dataset
predLeft = zip(testLeft.index.unique(),tmp)
In [118]:
len(testFull.index.unique())
Out[118]:
In [119]:
len(testNan.index.unique())
Out[119]:
In [120]:
len(testLeft.index.unique())
Out[120]:
In [122]:
pred = predFull + predNan + predLeft
In [124]:
pred.sort(key=lambda x: x[0], reverse=False)
In [134]:
submission = pd.DataFrame(pred)
submission.columns = ["Id","Expected"]
submission.head()
Out[134]:
In [136]:
submission.to_csv("first_submit.csv",index=False)