In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the training and test splits from CSV files in the working directory.
X_train, X_test = map(pd.read_csv, ("train.csv", "test.csv"))
In [4]:
# Preview the first rows of the training set.
X_train.head()
Out[4]:
In [5]:
# Preview the first rows of the test set.
X_test.head()
Out[5]:
In [6]:
# Per-column dtype and non-null count summary for the training set.
X_train.info()
In [7]:
# Same summary for the test set, for comparison with the training set.
X_test.info()
In [8]:
# Summary statistics of the training set.
X_train.describe()
Out[8]:
In [9]:
# Summary statistics of the test set.
X_test.describe()
Out[9]:
Out[9]:
In [10]:
# Number of distinct Product_ID values in each split.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
# nunique() counts distinct non-null values, the same quantity as
# len(value_counts()).
print(X_train["Product_ID"].nunique())
print(X_test["Product_ID"].nunique())
In [11]:
# Histogram of the number of rows per Product_ID in the training set.
X_train["Product_ID"].value_counts().hist()
Out[11]:
In [12]:
# Same histogram for the test set.
X_test["Product_ID"].value_counts().hist()
Out[12]:
In [13]:
# Frequency of each distinct value, column by column, shown for the training
# set and then the test set so the two splits can be compared side by side.

# Gender: training set.
X_train["Gender"].value_counts()
Out[13]:
In [14]:
# Gender: test set.
X_test["Gender"].value_counts()
Out[14]:
In [15]:
# Age: training set.
X_train["Age"].value_counts()
Out[15]:
In [16]:
# Age: test set.
X_test["Age"].value_counts()
Out[16]:
In [17]:
# Occupation: training set.
X_train["Occupation"].value_counts()
Out[17]:
In [18]:
# Occupation: test set.
X_test["Occupation"].value_counts()
Out[18]:
In [19]:
# City_Category: training set.
X_train["City_Category"].value_counts()
Out[19]:
In [20]:
# City_Category: test set.
X_test["City_Category"].value_counts()
Out[20]:
In [21]:
# Stay_In_Current_City_Years: training set.
X_train["Stay_In_Current_City_Years"].value_counts()
Out[21]:
In [22]:
# Stay_In_Current_City_Years: test set.
X_test["Stay_In_Current_City_Years"].value_counts()
Out[22]:
In [23]:
# Marital_Status: training set.
X_train["Marital_Status"].value_counts()
Out[23]:
In [24]:
# Marital_Status: test set.
X_test["Marital_Status"].value_counts()
Out[24]:
In [25]:
X_train["Purchase"].hist()
Out[25]:
In [26]:
X_train[X_train["Purchase"] == 0]
Out[26]:
In [73]:
# Frequency of each distinct value in the three Product_Category_* columns,
# training set first and test set second for each column.

# Product_Category_1: training set.
X_train["Product_Category_1"].value_counts()
Out[73]:
In [74]:
# Product_Category_1: test set.
X_test["Product_Category_1"].value_counts()
Out[74]:
In [75]:
# Product_Category_2: training set.
X_train["Product_Category_2"].value_counts()
Out[75]:
In [76]:
# Product_Category_2: test set.
X_test["Product_Category_2"].value_counts()
Out[76]:
In [77]:
# Product_Category_3: training set.
X_train["Product_Category_3"].value_counts()
Out[77]:
In [78]:
# Product_Category_3: test set.
X_test["Product_Category_3"].value_counts()
Out[78]:
In [8]:
X_train["Purchase"].values.sort()
In [11]:
X_train["Purchase"].tail(100)
Out[11]:
In [14]:
# Mean Purchase for each distinct Product_Category_3 value in the training set.
purchase_by_category3 = X_train.groupby("Product_Category_3")["Purchase"]
purchase_by_category3.mean()
Out[14]:
In [4]:
# Pairwise correlation of the numeric columns of the training set.
# The numeric/bool columns are selected explicitly: older pandas dropped
# non-numeric columns silently inside corr(), while pandas >= 2.0 raises if
# the frame contains any column that cannot be converted to float.
X_train.select_dtypes(include=["number", "bool"]).corr()
Out[4]:
In [6]:
X_train.groupby("User_ID")["Purchase"].sum().hist()
Out[6]:
In [8]:
X_train.groupby("User_ID")["Purchase"].sum().min()
Out[8]:
In [11]:
X_train.groupby("User_ID")["Purchase"].sum().mean()
Out[11]:
In [14]:
X_train.groupby("User_ID")["Purchase"].sum().describe(percentiles=[0.10,0.20,0.30,0.40,0.50,0.60,0.70,0.80,0.90,1.0])
Out[14]:
In [15]:
res = X_train.groupby("User_ID")["Purchase"].sum()
In [16]:
res
Out[16]:
In [3]:
# Toy inputs for the shuffle experiment below:
# `a` holds the floats 0..17 laid out as 3 samples of shape (2, 3),
# `b` holds the floats 0..5 laid out as 3 samples of shape (2,).
a = np.arange(18, dtype=float).reshape(3, 2, 3)
b = np.arange(6, dtype=float).reshape(3, 2)
In [5]:
# Show the shapes of both arrays. print(...) with a single argument gives the
# same output on Python 2 and Python 3; the bare `print x` statement is a
# SyntaxError on Python 3.
print(a.shape)
print(b.shape)
In [22]:
c = np.c_[a.reshape(len(a), -1), b.reshape(len(b), -1)]
In [23]:
print c
In [24]:
a2 = c[:, :a.size//len(a)].reshape(a.shape)
b2 = c[:, a.size//len(a):].reshape(b.shape)
In [25]:
print a2
print b2
In [28]:
np.random.shuffle(c)
In [29]:
print a2
print b2
In [ ]: