In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training and test tables from CSV files in the working directory.
X_train, X_test = map(pd.read_csv, ("train.csv", "test.csv"))

In [4]:
# Peek at the first five rows of the training table.
X_train.head(n=5)


Out[4]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6 14 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969

In [5]:
# Peek at the first five rows of the test table.
X_test.head(n=5)


Out[5]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
0 1000004 P00128942 M 46-50 7 B 2 1 1 11 NaN
1 1000009 P00113442 M 26-35 17 C 0 0 3 5 NaN
2 1000010 P00288442 F 36-45 1 B 4+ 1 5 14 NaN
3 1000010 P00145342 F 36-45 1 B 4+ 1 4 9 NaN
4 1000011 P00053842 F 26-35 1 C 1 0 4 5 12

In [6]:
# Column dtypes, non-null counts and memory footprint of the training table.
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 54.6+ MB

In [7]:
# Column dtypes, non-null counts and memory footprint of the test table.
X_test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 233599 entries, 0 to 233598
Data columns (total 11 columns):
User_ID                       233599 non-null int64
Product_ID                    233599 non-null object
Gender                        233599 non-null object
Age                           233599 non-null object
Occupation                    233599 non-null int64
City_Category                 233599 non-null object
Stay_In_Current_City_Years    233599 non-null object
Marital_Status                233599 non-null int64
Product_Category_1            233599 non-null int64
Product_Category_2            161255 non-null float64
Product_Category_3            71037 non-null float64
dtypes: float64(2), int64(4), object(5)
memory usage: 21.4+ MB

In [8]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric training columns.
X_train.describe()


Out[8]:
User_ID Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
count 550068.000000 550068.000000 550068.000000 550068.000000 376430.000000 166821.000000 550068.000000
mean 1003028.842401 8.076707 0.409653 5.404270 9.842329 12.668243 9263.968713
std 1727.591586 6.522660 0.491770 3.936211 5.086590 4.125338 5023.065394
min 1000001.000000 0.000000 0.000000 1.000000 2.000000 3.000000 12.000000
25% 1001516.000000 2.000000 0.000000 1.000000 5.000000 9.000000 5823.000000
50% 1003077.000000 7.000000 0.000000 5.000000 9.000000 14.000000 8047.000000
75% 1004478.000000 14.000000 1.000000 8.000000 15.000000 16.000000 12054.000000
max 1006040.000000 20.000000 1.000000 20.000000 18.000000 18.000000 23961.000000

In [9]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric test columns.
X_test.describe()


Out[9]:
User_ID Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
count 233599.000000 233599.000000 233599.000000 233599.000000 161255.000000 71037.000000
mean 1003029.356859 8.085407 0.410070 5.276542 9.849586 12.669454
std 1726.504968 6.521146 0.491847 3.736380 5.094943 4.125944
min 1000001.000000 0.000000 0.000000 1.000000 2.000000 3.000000
25% 1001527.000000 2.000000 0.000000 1.000000 5.000000 9.000000
50% 1003070.000000 7.000000 0.000000 5.000000 9.000000 14.000000
75% 1004477.000000 14.000000 1.000000 8.000000 15.000000 16.000000
max 1006040.000000 20.000000 1.000000 18.000000 18.000000 18.000000

In [10]:
# Number of distinct Product_ID values in each table.
# print(...) with a single argument gives the same output on Python 2 and
# Python 3 kernels, whereas the bare `print x` statement is Python 2 only.
print(len(X_train["Product_ID"].value_counts()))
print(len(X_test["Product_ID"].value_counts()))


3631
3491

In [11]:
# Histogram of how many training rows each Product_ID appears in.
X_train.Product_ID.value_counts().hist()


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7c9fd485d0>

In [12]:
# Histogram of how many test rows each Product_ID appears in.
X_test.Product_ID.value_counts().hist()


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7c9fd0da10>

In [13]:
# Row count per Gender value in the training table.
X_train.Gender.value_counts()


Out[13]:
M    414259
F    135809
dtype: int64

In [14]:
# Row count per Gender value in the test table.
X_test.Gender.value_counts()


Out[14]:
M    175772
F     57827
dtype: int64

In [15]:
# Row count per Age bracket in the training table.
X_train.Age.value_counts()


Out[15]:
26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
dtype: int64

In [16]:
# Row count per Age bracket in the test table.
X_test.Age.value_counts()


Out[16]:
26-35    93428
36-45    46711
18-25    42293
46-50    19577
51-55    16283
55+       9075
0-17      6232
dtype: int64

In [17]:
# Row count per Occupation code in the training table.
X_train.Occupation.value_counts()


Out[17]:
4     72308
0     69638
7     59133
1     47426
17    40043
20    33562
12    31179
14    27309
2     26588
16    25371
6     20355
3     17650
10    12930
5     12177
15    12165
11    11586
19     8461
13     7728
18     6622
9      6291
8      1546
dtype: int64

In [18]:
# Row count per Occupation code in the test table.
X_test.Occupation.value_counts()


Out[18]:
4     30778
0     29212
7     24994
1     20261
17    17375
20    14278
12    13269
14    11473
2     11408
16    10751
6      8747
3      7476
10     5374
5      5196
15     5191
11     5007
19     3458
13     3325
18     2745
9      2638
8       643
dtype: int64

In [19]:
# Row count per City_Category in the training table.
X_train.City_Category.value_counts()


Out[19]:
B    231173
C    171175
A    147720
dtype: int64

In [20]:
# Row count per City_Category in the test table.
X_test.City_Category.value_counts()


Out[20]:
B    98566
C    72509
A    62524
dtype: int64

In [21]:
# Row count per Stay_In_Current_City_Years value in the training table.
X_train.Stay_In_Current_City_Years.value_counts()


Out[21]:
1     193821
2     101838
3      95285
4+     84726
0      74398
dtype: int64

In [22]:
# Row count per Stay_In_Current_City_Years value in the test table.
X_test.Stay_In_Current_City_Years.value_counts()


Out[22]:
1     82604
2     43589
3     40143
4+    35945
0     31318
dtype: int64

In [23]:
# Row count per Marital_Status flag in the training table.
X_train.Marital_Status.value_counts()


Out[23]:
0    324731
1    225337
dtype: int64

In [24]:
# Row count per Marital_Status flag in the test table.
X_test.Marital_Status.value_counts()


Out[24]:
0    137807
1     95792
dtype: int64

In [25]:
# Distribution of the Purchase target in the training table.
X_train.Purchase.hist()


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7c9fbae950>

In [26]:
# Select training rows whose Purchase is exactly zero (the recorded output is empty).
X_train[X_train.Purchase == 0]


Out[26]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase

In [73]:
# Row count per Product_Category_1 code in the training table.
X_train.Product_Category_1.value_counts()


Out[73]:
5     150933
1     140378
8     113925
11     24287
2      23864
6      20466
3      20213
4      11753
16      9828
15      6290
13      5549
10      5125
12      3947
7       3721
18      3125
20      2550
19      1603
14      1523
17       578
9        410
dtype: int64

In [74]:
# Row count per Product_Category_1 code in the test table.
X_test.Product_Category_1.value_counts()


Out[74]:
5     65017
1     60321
8     48369
2     10192
11    10153
6      8860
3      8578
4      5003
16     4105
15     2694
13     2381
10     2248
12     1663
7      1624
18     1311
14      663
17      223
9       194
dtype: int64

In [75]:
# Row count per Product_Category_2 code in the training table.
X_train.Product_Category_2.value_counts()


Out[75]:
8     64088
14    55108
2     49217
16    43255
15    37855
5     26235
4     25677
6     16466
11    14134
17    13320
13    10531
9      5693
12     5528
10     3043
3      2884
18     2770
7       626
dtype: int64

In [76]:
# Row count per Product_Category_2 code in the test table.
X_test.Product_Category_2.value_counts()


Out[76]:
8     27229
14    23726
2     21281
16    18432
15    16259
4     11028
5     10930
6      7109
11     6096
17     5784
13     4523
9      2484
12     2273
10     1377
18     1257
3      1239
7       228
dtype: int64

In [77]:
# Row count per Product_Category_3 code in the training table.
X_train.Product_Category_3.value_counts()


Out[77]:
16    32636
15    28013
14    18428
17    16702
5     16658
8     12562
9     11579
12     9246
13     5459
6      4890
18     4629
4      1875
11     1805
10     1726
3       613
dtype: int64

In [78]:
# Row count per Product_Category_3 code in the test table.
X_test.Product_Category_3.value_counts()


Out[78]:
16    13833
15    11955
14     7855
5      7141
17     7116
8      5299
9      4953
12     3869
13     2390
6      1998
18     1992
4       816
11      780
10      775
3       265
dtype: int64

In [8]:
X_train["Purchase"].values.sort()

In [11]:
X_train["Purchase"].tail(100)


Out[11]:
549968    23919
549969    23920
549970    23920
549971    23921
549972    23921
549973    23921
549974    23923
549975    23924
549976    23924
549977    23925
549978    23926
549979    23926
549980    23926
549981    23927
549982    23927
549983    23928
549984    23928
549985    23928
549986    23928
549987    23929
549988    23929
549989    23929
549990    23930
549991    23930
549992    23930
549993    23930
549994    23930
549995    23931
549996    23931
549997    23932
          ...  
550038    23948
550039    23948
550040    23949
550041    23949
550042    23949
550043    23949
550044    23950
550045    23951
550046    23952
550047    23953
550048    23953
550049    23954
550050    23954
550051    23955
550052    23955
550053    23955
550054    23956
550055    23958
550056    23958
550057    23958
550058    23958
550059    23959
550060    23959
550061    23960
550062    23960
550063    23960
550064    23960
550065    23961
550066    23961
550067    23961
Name: Purchase, dtype: int64

In [14]:
# Mean purchase amount for each Product_Category_3 code.
X_train.groupby("Product_Category_3").Purchase.mean()


Out[14]:
Product_Category_3
3     13939.696574
4      9794.386667
5     12117.786889
6     13194.311043
8     13024.918882
9     10431.697210
10    13505.813441
11    12091.437673
12     8715.512762
13    13185.118703
14    10052.594530
15    12339.369900
16    11981.890642
17    11769.943001
18    10993.980773
Name: Purchase, dtype: float64

In [4]:
# Pairwise Pearson correlation between the numeric columns.
# The numeric columns are selected explicitly: older pandas silently dropped
# the string columns inside corr(), but pandas >= 2.0 raises on them, so a
# bare X_train.corr() no longer runs there. The resulting matrix is the same.
X_train.select_dtypes(include=[np.number]).corr()


Out[4]:
User_ID Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
User_ID 1.000000 -0.023971 0.020443 0.003825 0.001529 0.003419 0.004716
Occupation -0.023971 1.000000 0.024280 -0.007618 -0.000384 0.013263 0.020833
Marital_Status 0.020443 0.024280 1.000000 0.019888 0.015138 0.019473 -0.000463
Product_Category_1 0.003825 -0.007618 0.019888 1.000000 0.540583 0.229678 -0.343703
Product_Category_2 0.001529 -0.000384 0.015138 0.540583 1.000000 0.543649 -0.209918
Product_Category_3 0.003419 0.013263 0.019473 0.229678 0.543649 1.000000 -0.022006
Purchase 0.004716 0.020833 -0.000463 -0.343703 -0.209918 -0.022006 1.000000

In [6]:
# Distribution of total purchase amount per user.
X_train.groupby("User_ID").Purchase.sum().hist()


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc5d040d250>

In [8]:
# Smallest total purchase amount accumulated by any single user.
X_train.groupby("User_ID").Purchase.sum().min()


Out[8]:
46681

In [11]:
# Average total purchase amount per user.
X_train.groupby("User_ID").Purchase.sum().mean()


Out[11]:
865016.5917501273

In [14]:
# Per-user total purchase amount, summarised at every decile up to the maximum.
X_train.groupby("User_ID").Purchase.sum().describe(
    percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]
)


Out[14]:
count        5891.000000
mean       865016.591750
std        943644.539683
min         46681.000000
10%        146570.000000
20%        205272.000000
30%        279288.000000
40%        383455.000000
50%        521213.000000
60%        698842.000000
70%        942900.000000
80%       1355245.000000
90%       2069404.000000
100%     10536909.000000
max      10536909.000000
Name: Purchase, dtype: float64

In [15]:
# Total purchase amount per user, indexed by User_ID.
res = X_train.groupby("User_ID").Purchase.sum()

In [16]:
# Display the per-user totals held in `res` (pandas truncates the middle of long Series).
res


Out[16]:
User_ID
1000001     334093
1000002     810472
1000003     341635
1000004     206468
1000005     821001
1000006     379930
1000007     234668
1000008     796593
1000009     594099
1000010    2169510
1000011     557023
1000012     120801
1000013     713927
1000014     127629
1000015    1047728
1000016     150490
1000017    1425995
1000018    1979047
1000019    1458069
1000020     185747
1000021     127099
1000022    1279914
1000023    1670998
1000024     720899
1000025     534706
1000026    1606174
1000027     448934
1000028     516560
1000029     696201
1000030     261584
            ...   
1006011    1198714
1006012     127920
1006013     622847
1006014     528238
1006015     255812
1006016    3770970
1006017     160230
1006018     975585
1006019     604563
1006020     374475
1006021     709448
1006022     278257
1006023    1222210
1006024     827570
1006025    1040257
1006026     490768
1006027     265201
1006028     362972
1006029     157436
1006030     737361
1006031     286374
1006032     517261
1006033     501843
1006034     197086
1006035     956645
1006036    4116058
1006037    1119538
1006038      90034
1006039     590319
1006040    1653299
Name: Purchase, dtype: int64

Trying out whether I can shuffle two arrays with dimensions like

a.shape (3, 2, 3)

b.shape (3, 2)

I am trying to shuffle a and b in unison, so that if row 2 of a moves to row 1 of a, the same move is applied to b.


In [3]:
# Toy inputs for the paired-shuffle experiment. Both arrays have a leading
# axis of length 3; `a` holds a 2x3 block per entry and `b` a pair per entry.
a = np.array([
    [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]],
    [[6.0, 7.0, 8.0], [9.0, 10.0, 11.0]],
    [[12.0, 13.0, 14.0], [15.0, 16.0, 17.0]],
])

b = np.array([
    [0.0, 1.0],
    [2.0, 3.0],
    [4.0, 5.0],
])

In [5]:
# Confirm the shapes of both toy arrays. The single-argument print(...) form
# produces the same text on Python 2 and also runs on Python 3.
print(a.shape)
print(b.shape)


(3, 2, 3)
(3, 2)

Merge the 2 arrays into 1 array


In [22]:
# Flatten each leading-axis entry of `a` and `b` to one row (reshape(len(x), -1))
# and join the two column-wise, so every row of `c` holds one entry of `a`
# followed by the matching entry of `b`.
c = np.c_[a.reshape(len(a), -1), b.reshape(len(b), -1)]

In [23]:
# Show the merged array; print(...) with one argument works on Python 2 and 3.
print(c)


[[  0.   1.   2.   3.   4.   5.   0.   1.]
 [  6.   7.   8.   9.  10.  11.   2.   3.]
 [ 12.  13.  14.  15.  16.  17.   4.   5.]]

Extract the 2 arrays out


In [24]:
# a.size // len(a) is the number of values in one leading-axis entry of `a`,
# i.e. the column of `c` at which the data from `b` starts.
# The output recorded after the shuffle cell shows a2 and b2 changing when `c`
# is shuffled, so these reshaped slices behave as views onto `c`, not copies.
a2 = c[:, :a.size//len(a)].reshape(a.shape)
b2 = c[:, a.size//len(a):].reshape(b.shape)

In [25]:
# Show the two arrays recovered from `c` before shuffling.
# print(...) with one argument works on Python 2 and 3.
print(a2)
print(b2)


[[[  0.   1.   2.]
  [  3.   4.   5.]]

 [[  6.   7.   8.]
  [  9.  10.  11.]]

 [[ 12.  13.  14.]
  [ 15.  16.  17.]]]
[[ 0.  1.]
 [ 2.  3.]
 [ 4.  5.]]

Shuffle and see the output.


In [28]:
# Shuffle the rows of `c` in place; a2 and b2 are inspected afterwards to see
# the same reordering. No seed is set in the visible cells, so the resulting
# order can differ between runs.
np.random.shuffle(c)

In [29]:
# Show a2 and b2 again after `c` was shuffled.
# print(...) with one argument works on Python 2 and 3.
print(a2)
print(b2)


[[[  6.   7.   8.]
  [  9.  10.  11.]]

 [[ 12.  13.  14.]
  [ 15.  16.  17.]]

 [[  0.   1.   2.]
  [  3.   4.   5.]]]
[[ 2.  3.]
 [ 4.  5.]
 [ 0.  1.]]

In [ ]: