In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
# NOTE(review): `display.mpl_style` only exists in old pandas releases; newer
# versions dropped the option (use `plt.style.use(...)` when upgrading).
pd.options.display.mpl_style = 'default'

# Single place to repoint the notebook at another checkout of the data.
DATA_ROOT = '/home/loschen/Desktop/datamining-kaggle/numerai/data'

data_id1 = 19
data_id2 = 20


def load_training_features(data_id, suffix=''):
    """Read one numerai training CSV and return its columns without `target`.

    Parameters
    ----------
    data_id : value interpolated into the `numerai_datasets_<id>` folder name.
    suffix : str appended to every column name (empty string leaves the
        names untouched).

    Returns
    -------
    pandas.DataFrame holding every column of the CSV except `target`.
    """
    path = '%s/numerai_datasets_%s/numerai_training_data.csv' % (DATA_ROOT, data_id)
    # Non-inplace drop: the frame returned by read_csv is never mutated, so
    # re-running the cell always starts from the file contents.
    features = pd.read_csv(path).drop(['target'], axis=1)
    if suffix:
        features.columns = [name + suffix for name in features.columns]
    return features


Xtrain1 = load_training_features(data_id1)
# The '_b' suffix keeps the second dataset's column names distinct from the first's.
Xtrain2 = load_training_features(data_id2, suffix='_b')

print(Xtrain1.shape)
print(Xtrain2.shape)

# NOTE(review): later cells reference `Xtrain`, which no visible cell assigns.
# The disabled line below is the only trace of it in this notebook.
#Xtrain = pd.concat([Xtrain[['feature1']], Xtrain2], ignore_index=True)
In [16]:
def correlation_clustermap(frame):
    """Compute the column correlation matrix of `frame` and draw its clustermap.

    Returns a `(corr, grid)` pair: the correlation DataFrame from
    `frame.corr()` and the object returned by `sns.clustermap(corr)`.
    """
    corr = frame.corr()
    return corr, sns.clustermap(corr)


# Plot styling is applied once, before either figure is created.
sns.set(context="paper", font="monospace")

corr_train, m = correlation_clustermap(Xtrain1)
corr_train2, m2 = correlation_clustermap(Xtrain2)

# `data2d` is the correlation matrix after the clustermap reordered it, so its
# column index lists the features in clustered order.
print("%s %s" % ("sorted 1:", m.data2d.columns))
print("%s %s" % ("sorted 2:", m2.data2d.columns))
In [11]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
# Number of feature clusters requested from KMeans for both datasets.
N_CLUSTERS = 7
# Fixed seed so cluster membership is reproducible across kernel restarts.
KMEANS_SEED = 0

# Transposed so that each *feature* (column) becomes one sample to cluster.
Xf1 = Xtrain1.values.T
Xf2 = Xtrain2.values.T

# NOTE(review): `n_jobs` is kept from the original call; recent scikit-learn
# releases no longer accept it on KMeans, so drop it when upgrading.
est1 = KMeans(n_clusters=N_CLUSTERS, n_jobs=4, random_state=KMEANS_SEED)
est1.fit(Xf1)
labels1 = est1.labels_

est2 = KMeans(n_clusters=N_CLUSTERS, n_jobs=4, random_state=KMEANS_SEED)
est2.fit(Xf2)
labels2 = est2.labels_

# clusteredN[k] is the list of column names that KMeans assigned to cluster k.
clustered1 = [list(Xtrain1.columns[labels1 == k].values) for k in range(N_CLUSTERS)]
clustered2 = [list(Xtrain2.columns[labels2 == k].values) for k in range(N_CLUSTERS)]

print(clustered1)
print(clustered2)
In [ ]:
# Walk every ordered pair of dataset-1 feature clusters and print the joint
# covariance matrix of their columns, pausing for Enter after each pair.
# NOTE(review): both loops iterate `clustered1` and both slices read `Xtrain1`;
# `clustered2` / `Xtrain2` are never compared here -- confirm that is intended.
# NOTE(review): nesting was reconstructed from a flattened export; all
# statements are assumed to belong to the inner loop.
for c1 in clustered1:
    for c2 in clustered1:
        print(c1)
        print(c2)
        print("")
        # Label-based slice: rows with index labels up to and including 100.
        left_block = Xtrain1.loc[:100, c1].values
        right_block = Xtrain1.loc[:100, c2].values
        # rowvar=False treats columns as variables, rows as observations.
        cvm = np.cov(left_block, right_block, rowvar=False)
        print(cvm.shape)
        print(cvm)
        raw_input()
In [19]:
# NOTE(review): `Xtrain` is not assigned in any visible cell of this notebook;
# this cell relies on kernel state from elsewhere -- confirm its source.

# Histogram (20 bins) of every column.
Xtrain.hist(bins=20)
plt.show()

# Box plot of the first three named features.
box_columns = ['feature1', 'feature2', 'feature3']
Xtrain[box_columns].plot(kind='box')
Out[19]:
In [41]:
# Plot the 2000-row rolling mean of the `feature2` column.
# NOTE(review): `pd.rolling_mean` is the old function-style API; newer pandas
# spells this `series.rolling(2000).mean()`.
feature2_smoothed = pd.rolling_mean(Xtrain['feature2'], 2000)
feature2_smoothed.plot()
Out[41]:
In [37]:
# Take the row at position 2 (one value per column) and plot its rolling mean
# with a window of 2 adjacent columns.
row_at_2 = Xtrain.iloc[2]
pd.rolling_mean(row_at_2, 2).plot()
Out[37]:
In [29]:
# Plot the values of the first row (position 0) across all columns.
first_row = Xtrain.iloc[0]
first_row.plot()
Out[29]:
In [57]:
# Column-wise rolling mean with a window of 1000 rows.
# `args` must be a tuple: `(1000)` is just the int 1000, which cannot be
# star-unpacked into the function call, so the trailing comma is required.
rm = Xtrain.apply(pd.rolling_mean, args=(1000,), axis=0)
In [53]:
# Echo the dimensions of the rolling-mean result.
rm.shape
Out[53]:
In [54]:
# Echo the type of object that `rm` currently holds.
type(rm)
Out[54]:
rm
In [55]:
# Display `rm` in full.
# NOTE(review): if `rm` is large, `rm.head()` / `rm.tail()` keeps the notebook readable.
rm
Out[55]:
In [56]:
# Bare attribute reference (no call): the cell only echoes the function object.
# NOTE(review): looks like leftover exploration -- remove if not needed.
pd.rolling_count
In [ ]: