In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import cv2
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; the replacement module is sklearn.model_selection.
# Left untouched because some class signatures differ between the two modules
# and later cells (not all visible here) may depend on the old ones --
# confirm the installed scikit-learn version before migrating.
from sklearn import cross_validation as cv
from sklearn import svm
from sklearn import ensemble
from sklearn import linear_model
In [2]:
# Load the raw check-in training data (path relative to the notebook);
# `train` is the DataFrame every cell below works from.
train = pd.read_csv('../data/raw/train.csv')
# Parenthesized print produces identical output on Python 2 and 3.
print(train.shape)
In [3]:
# Number of distinct target classes (place_id); reused below as the cluster
# count K.
uniq = train['place_id'].nunique()
print(uniq)
In [4]:
# Split into feature matrix (all columns but the last) and target vector
# (the last column, place_id).
# NOTE(review): column 0 (row_id) is included in train_X even though the
# notes below call it useless -- consider dropping it before modelling.
train_X = train.values[:,:-1]
train_t = train.values[:,-1]
print(train_X.shape)
print(train_t.shape)
In [5]:
# Summary statistics (count/mean/std/min/quartiles/max) of every column.
train.describe()
Out[5]:
In [6]:
# Peek at the first rows to sanity-check the parsing.
train.head()
Out[6]:
In [7]:
# Peek at the last rows.
train.tail()
Out[7]:
In [ ]:
# train['place_id'].value_counts().plot(kind='bar')
# train['place_id'].value_counts().plot(kind='barh')
In [ ]:
# Histogram of the raw GPS accuracy values (50 bins) with a rug plot.
# NOTE(review): sb.distplot was deprecated in seaborn 0.11 (histplot/displot
# replace it) -- confirm the installed seaborn version.
sb.distplot(train['accuracy'], bins=50, kde=False, rug=True);
In [ ]:
# Kernel-density estimate of accuracy (no histogram), again with a rug plot.
sb.distplot(train['accuracy'], hist=False, rug=True);
In [ ]:
# Hexbin joint plot of the spatial coordinates x vs y; the "white" style is
# scoped to this figure only via the context manager.
# FIX: the body of the `with` block had lost its indentation (notebook
# export artifact), which is a SyntaxError -- restored here.
with sb.axes_style("white"):
    sb.jointplot(x=train['x'], y=train['y'], kind="hex", color="k");
In [ ]:
# Hexbin joint plot of accuracy vs time, to look for a relationship between
# the GPS reading quality and when the check-in happened.
# FIX: the body of the `with` block had lost its indentation (notebook
# export artifact), which is a SyntaxError -- restored here.
with sb.axes_style("white"):
    sb.jointplot(x=train['accuracy'], y=train['time'], kind="hex", color="k");
Did you specify the type of data-analytic question (e.g. exploration, association, causality) before touching the data?
Did you define the metric for success before beginning?
Did you understand the context for the question and the scientific or business application? *We are building a system that ranks a list of places given 'coords', 'accuracy' and 'time'. The purpose might be to enable specific ads (e.g. interesting places around a hotel) to be shown to the person (on Facebook?) depending on this list.*
Did you record the experimental design?
Did you consider whether the question could be answered with the available data?
Null values?
What do we know of the measurements?
First column is ID and is useless.
The second and third columns are coordinates; they are in kilometers and are floating point. The minimum is (0, 0) and the maximum is (10, 10).
The fourth column is accuracy. Its range is (1, 1033) and it seems to follow a power-law distribution. We assume that this is the accuracy of the location given by the GPS. This claim is supported by the fact that the data comes from a mobile device, which is able to give a location, but this information is sometimes not accurate (e.g. in buildings), so we would like to know the accuracy of the reading. In order to convert this into real accuracy, we need to normalize the column and assign it values of (1 - current_val).
The fifth column is time given as a timestamp. What patterns are there?
Last column is the class_id, given as an integer
In [8]:
# Min-max normalize every feature column (x, y, accuracy, time) into [0, 1].
# Column 0 (row_id) and the last column (the place_id target) are excluded
# by the [1:-1] slice and left untouched.
col_headers = list(train.columns.values)
print(col_headers)
train[col_headers[1:-1]] = train[col_headers[1:-1]].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
# Invert the normalized accuracy so that larger values mean a better GPS fix,
# per the plan in the notes above (accuracy := 1 - current_val).
train['accuracy'] = 1 - train['accuracy']
In [9]:
# Verify the normalization: the feature columns should now lie in [0, 1].
train.describe()
Out[9]:
In [10]:
# Re-inspect the first rows after normalization.
train.head()
Out[10]:
In [11]:
# Re-inspect the last rows after normalization.
train.tail()
Out[11]:
In [12]:
# Normalized feature matrix (drops only the place_id target column).
# NOTE(review): column 0 (row_id) is still included here and was NOT
# normalized above (the [1:-1] slice skipped it), so its raw magnitude will
# dominate any distance-based clustering -- consider dropping it.
train_X_norm = train.values[:,:-1]
print(train_X_norm.shape)
In [ ]:
# Clustering hyper-parameters for the mini-batch k-means run below.
batch_size = 500     # samples per mini-batch
n_init = 10          # number of k-means++ restarts
K = uniq             # one cluster per distinct place_id
clusters = range(K)  # cluster indices 0..K-1
In [ ]:
# Fit mini-batch k-means with K clusters on the normalized features.
# BUG FIX: `random_state` was constructed but never passed to the estimator,
# so runs were not reproducible; it is now wired through.
random_state = np.random.RandomState(0)
mbk = MiniBatchKMeans(init='k-means++', n_clusters=K, batch_size=batch_size,
                      n_init=n_init, max_no_improvement=10, verbose=True,
                      random_state=random_state)
# fit() returns the fitted estimator itself, not transformed data.
X_kmeans = mbk.fit(train_X_norm)
print("Done!")
In [20]:
# Minimal OpenCV k-means demo on synthetic 2-D data (heights/weights),
# adapted from the OpenCV-Python tutorials. Imports repeated so the cell is
# self-contained.
import numpy as np
import cv2
from matplotlib import pyplot as plt

np.random.seed(0)  # make the synthetic demo reproducible across re-runs
X = np.random.randint(25, 50, (25, 2))
Y = np.random.randint(60, 85, (25, 2))
Z = np.vstack((X, Y))
# cv2.kmeans requires float32 input
Z = np.float32(Z)
print(Z.shape)

# Stop after 10 iterations or when the centers move less than epsilon=1.0.
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# NOTE(review): this is the OpenCV 2.4 call; OpenCV >= 3.0 takes an extra
# `bestLabels` argument: cv2.kmeans(Z, 2, None, criteria, 10, flags).
# Confirm the installed OpenCV version.
ret, label, center = cv2.kmeans(Z, 2, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

# Split the points by assigned cluster (labels come back as a column vector,
# hence ravel()).
A = Z[label.ravel() == 0]
B = Z[label.ravel() == 1]

# Plot both clusters and the two centers.
plt.scatter(A[:, 0], A[:, 1])
plt.scatter(B[:, 0], B[:, 1], c='r')
plt.scatter(center[:, 0], center[:, 1], s=80, c='y', marker='s')
plt.xlabel('Height'), plt.ylabel('Weight')
plt.show()
In [ ]:
# Run OpenCV k-means on the full normalized training features.
# cv2.kmeans requires float32 input.
train_X_norm = train_X_norm.astype(np.float32)
print(train_X_norm.dtype)
print(train_X_norm.shape)
# Same stopping criteria as the demo cell: 10 iterations or epsilon=1.0.
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# NOTE(review): OpenCV 2.4 signature; OpenCV >= 3.0 needs an extra
# `bestLabels` argument (pass None) -- confirm the installed version.
ret, label, center = cv2.kmeans(train_X_norm, K, criteria, n_init, cv2.KMEANS_RANDOM_CENTERS)
print(center.shape)
In [ ]: