notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
# Read the train and test splits from CSV files in the working directory.
X_train, X_test = [pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")]



In [4]:

    
# Preview the first five training rows (five is head()'s default).
X_train.head(n=5)









    Out[4]:






  
    
      
      User_ID
      Product_ID
      Gender
      Age
      Occupation
      City_Category
      Stay_In_Current_City_Years
      Marital_Status
      Product_Category_1
      Product_Category_2
      Product_Category_3
      Purchase
    
  
  
    
      0
      1000001
      P00069042
      F
      0-17
      10
      A
      2
      0
      3
      NaN
      NaN
      8370
    
    
      1
      1000001
      P00248942
      F
      0-17
      10
      A
      2
      0
      1
      6
      14
      15200
    
    
      2
      1000001
      P00087842
      F
      0-17
      10
      A
      2
      0
      12
      NaN
      NaN
      1422
    
    
      3
      1000001
      P00085442
      F
      0-17
      10
      A
      2
      0
      12
      14
      NaN
      1057
    
    
      4
      1000002
      P00285442
      M
      55+
      16
      C
      4+
      0
      8
      NaN
      NaN
      7969



In [5]:

    
# Preview the first five test rows (five is head()'s default).
X_test.head(n=5)









    Out[5]:






  
    
      
      User_ID
      Product_ID
      Gender
      Age
      Occupation
      City_Category
      Stay_In_Current_City_Years
      Marital_Status
      Product_Category_1
      Product_Category_2
      Product_Category_3
    
  
  
    
      0
      1000004
      P00128942
      M
      46-50
      7
      B
      2
      1
      1
      11
      NaN
    
    
      1
      1000009
      P00113442
      M
      26-35
      17
      C
      0
      0
      3
      5
      NaN
    
    
      2
      1000010
      P00288442
      F
      36-45
      1
      B
      4+
      1
      5
      14
      NaN
    
    
      3
      1000010
      P00145342
      F
      36-45
      1
      B
      4+
      1
      4
      9
      NaN
    
    
      4
      1000011
      P00053842
      F
      26-35
      1
      C
      1
      0
      4
      5
      12



In [6]:

    
# Print the column dtypes and non-null counts of the training frame.
X_train.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 54.6+ MB



In [7]:

    
# Print the column dtypes and non-null counts of the test frame.
X_test.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 233599 entries, 0 to 233598
Data columns (total 11 columns):
User_ID                       233599 non-null int64
Product_ID                    233599 non-null object
Gender                        233599 non-null object
Age                           233599 non-null object
Occupation                    233599 non-null int64
City_Category                 233599 non-null object
Stay_In_Current_City_Years    233599 non-null object
Marital_Status                233599 non-null int64
Product_Category_1            233599 non-null int64
Product_Category_2            161255 non-null float64
Product_Category_3            71037 non-null float64
dtypes: float64(2), int64(4), object(5)
memory usage: 21.4+ MB



In [8]:

    
# Summary statistics of the training frame with the default quartiles spelled out.
X_train.describe(percentiles=[0.25, 0.5, 0.75])









    Out[8]:






  
    
      
      User_ID
      Occupation
      Marital_Status
      Product_Category_1
      Product_Category_2
      Product_Category_3
      Purchase
    
  
  
    
      count
      550068.000000
      550068.000000
      550068.000000
      550068.000000
      376430.000000
      166821.000000
      550068.000000
    
    
      mean
      1003028.842401
      8.076707
      0.409653
      5.404270
      9.842329
      12.668243
      9263.968713
    
    
      std
      1727.591586
      6.522660
      0.491770
      3.936211
      5.086590
      4.125338
      5023.065394
    
    
      min
      1000001.000000
      0.000000
      0.000000
      1.000000
      2.000000
      3.000000
      12.000000
    
    
      25%
      1001516.000000
      2.000000
      0.000000
      1.000000
      5.000000
      9.000000
      5823.000000
    
    
      50%
      1003077.000000
      7.000000
      0.000000
      5.000000
      9.000000
      14.000000
      8047.000000
    
    
      75%
      1004478.000000
      14.000000
      1.000000
      8.000000
      15.000000
      16.000000
      12054.000000
    
    
      max
      1006040.000000
      20.000000
      1.000000
      20.000000
      18.000000
      18.000000
      23961.000000



In [9]:

    
# Summary statistics of the test frame with the default quartiles spelled out.
X_test.describe(percentiles=[0.25, 0.5, 0.75])









    Out[9]:






  
    
      
      User_ID
      Occupation
      Marital_Status
      Product_Category_1
      Product_Category_2
      Product_Category_3
    
  
  
    
      count
      233599.000000
      233599.000000
      233599.000000
      233599.000000
      161255.000000
      71037.000000
    
    
      mean
      1003029.356859
      8.085407
      0.410070
      5.276542
      9.849586
      12.669454
    
    
      std
      1726.504968
      6.521146
      0.491847
      3.736380
      5.094943
      4.125944
    
    
      min
      1000001.000000
      0.000000
      0.000000
      1.000000
      2.000000
      3.000000
    
    
      25%
      1001527.000000
      2.000000
      0.000000
      1.000000
      5.000000
      9.000000
    
    
      50%
      1003070.000000
      7.000000
      0.000000
      5.000000
      9.000000
      14.000000
    
    
      75%
      1004477.000000
      14.000000
      1.000000
      8.000000
      15.000000
      16.000000
    
    
      max
      1006040.000000
      20.000000
      1.000000
      18.000000
      18.000000
      18.000000



In [10]:

    
# Number of distinct products in each split.
# print() is called as a function so the cell runs on Python 2 and Python 3,
# and nunique() counts distinct values without building the full frequency table.
print(X_train["Product_ID"].nunique())
print(X_test["Product_ID"].nunique())



In [11]:

    
# Histogram of how many rows each product has in the training set:
# x is the per-product row count, y is the number of products in that bin.
ax = X_train["Product_ID"].value_counts().hist()
# Label the figure so it stands alone; the trailing ';' hides the repr noise.
ax.set(title="Rows per product (train)",
       xlabel="Rows per Product_ID",
       ylabel="Number of products");









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f7c9fd485d0>



In [12]:

    
# Histogram of how many rows each product has in the test set:
# x is the per-product row count, y is the number of products in that bin.
ax = X_test["Product_ID"].value_counts().hist()
# Label the figure so it stands alone; the trailing ';' hides the repr noise.
ax.set(title="Rows per product (test)",
       xlabel="Rows per Product_ID",
       ylabel="Number of products");









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f7c9fd0da10>



In [13]:

    
# Row count per Gender value in the training set, most frequent first.
X_train["Gender"].value_counts(sort=True)









    Out[13]:





M    414259
F    135809
dtype: int64



In [14]:

    
# Row count per Gender value in the test set, most frequent first.
X_test["Gender"].value_counts(sort=True)









    Out[14]:





M    175772
F     57827
dtype: int64



In [15]:

    
# Row count per Age bucket in the training set, most frequent first.
X_train["Age"].value_counts(sort=True)









    Out[15]:





26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
dtype: int64



In [16]:

    
# Row count per Age bucket in the test set, most frequent first.
X_test["Age"].value_counts(sort=True)









    Out[16]:





26-35    93428
36-45    46711
18-25    42293
46-50    19577
51-55    16283
55+       9075
0-17      6232
dtype: int64



In [17]:

    
# Row count per Occupation code in the training set, most frequent first.
X_train["Occupation"].value_counts(sort=True)









    Out[17]:





4     72308
0     69638
7     59133
1     47426
17    40043
20    33562
12    31179
14    27309
2     26588
16    25371
6     20355
3     17650
10    12930
5     12177
15    12165
11    11586
19     8461
13     7728
18     6622
9      6291
8      1546
dtype: int64



In [18]:

    
# Row count per Occupation code in the test set, most frequent first.
X_test["Occupation"].value_counts(sort=True)









    Out[18]:





4     30778
0     29212
7     24994
1     20261
17    17375
20    14278
12    13269
14    11473
2     11408
16    10751
6      8747
3      7476
10     5374
5      5196
15     5191
11     5007
19     3458
13     3325
18     2745
9      2638
8       643
dtype: int64



In [19]:

    
# Row count per City_Category in the training set, most frequent first.
X_train["City_Category"].value_counts(sort=True)









    Out[19]:





B    231173
C    171175
A    147720
dtype: int64



In [20]:

    
# Row count per City_Category in the test set, most frequent first.
X_test["City_Category"].value_counts(sort=True)









    Out[20]:





B    98566
C    72509
A    62524
dtype: int64



In [21]:

    
# Row count per Stay_In_Current_City_Years value in the training set, most frequent first.
X_train["Stay_In_Current_City_Years"].value_counts(sort=True)









    Out[21]:





1     193821
2     101838
3      95285
4+     84726
0      74398
dtype: int64



In [22]:

    
# Row count per Stay_In_Current_City_Years value in the test set, most frequent first.
X_test["Stay_In_Current_City_Years"].value_counts(sort=True)









    Out[22]:





1     82604
2     43589
3     40143
4+    35945
0     31318
dtype: int64



In [23]:

    
# Row count per Marital_Status flag in the training set, most frequent first.
X_train["Marital_Status"].value_counts(sort=True)









    Out[23]:





0    324731
1    225337
dtype: int64



In [24]:

    
# Row count per Marital_Status flag in the test set, most frequent first.
X_test["Marital_Status"].value_counts(sort=True)









    Out[24]:





0    137807
1     95792
dtype: int64



In [25]:

    
# Distribution of the target column (Purchase) over all training rows.
ax = X_train["Purchase"].hist()
# Label the figure so it stands alone; the trailing ';' hides the repr noise.
ax.set(title="Purchase amount distribution (train)",
       xlabel="Purchase",
       ylabel="Number of rows");









    Out[25]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f7c9fbae950>



In [26]:

    
# Select the training rows whose Purchase is exactly zero.
X_train.loc[X_train["Purchase"].eq(0)]









    Out[26]:






  
    
      
      User_ID
      Product_ID
      Gender
      Age
      Occupation
      City_Category
      Stay_In_Current_City_Years
      Marital_Status
      Product_Category_1
      Product_Category_2
      Product_Category_3
      Purchase



In [73]:

    
# Row count per Product_Category_1 code in the training set, most frequent first.
X_train["Product_Category_1"].value_counts(sort=True)









    Out[73]:





5     150933
1     140378
8     113925
11     24287
2      23864
6      20466
3      20213
4      11753
16      9828
15      6290
13      5549
10      5125
12      3947
7       3721
18      3125
20      2550
19      1603
14      1523
17       578
9        410
dtype: int64



In [74]:

    
# Row count per Product_Category_1 code in the test set, most frequent first.
X_test["Product_Category_1"].value_counts(sort=True)









    Out[74]:





5     65017
1     60321
8     48369
2     10192
11    10153
6      8860
3      8578
4      5003
16     4105
15     2694
13     2381
10     2248
12     1663
7      1624
18     1311
14      663
17      223
9       194
dtype: int64



In [75]:

    
# Row count per Product_Category_2 code in the training set.
# This column contains NaN; dropna=True (the default) leaves those rows out of the tally.
X_train["Product_Category_2"].value_counts(dropna=True)









    Out[75]:





8     64088
14    55108
2     49217
16    43255
15    37855
5     26235
4     25677
6     16466
11    14134
17    13320
13    10531
9      5693
12     5528
10     3043
3      2884
18     2770
7       626
dtype: int64



In [76]:

    
# Row count per Product_Category_2 code in the test set.
# This column contains NaN; dropna=True (the default) leaves those rows out of the tally.
X_test["Product_Category_2"].value_counts(dropna=True)









    Out[76]:





8     27229
14    23726
2     21281
16    18432
15    16259
4     11028
5     10930
6      7109
11     6096
17     5784
13     4523
9      2484
12     2273
10     1377
18     1257
3      1239
7       228
dtype: int64



In [77]:

    
# Row count per Product_Category_3 code in the training set.
# This column contains NaN; dropna=True (the default) leaves those rows out of the tally.
X_train["Product_Category_3"].value_counts(dropna=True)









    Out[77]:





16    32636
15    28013
14    18428
17    16702
5     16658
8     12562
9     11579
12     9246
13     5459
6      4890
18     4629
4      1875
11     1805
10     1726
3       613
dtype: int64



In [78]:

    
# Row count per Product_Category_3 code in the test set.
# This column contains NaN; dropna=True (the default) leaves those rows out of the tally.
X_test["Product_Category_3"].value_counts(dropna=True)









    Out[78]:





16    13833
15    11955
14     7855
5      7141
17     7116
8      5299
9      4953
12     3869
13     2390
6      1998
18     1992
4       816
11      780
10      775
3       265
dtype: int64



In [8]:

    
# Sort a COPY of the purchase amounts (np.sort returns a new array).
# Calling .sort() on the column's .values sorts the column inside X_train in
# place without moving the other columns, so every row would end up paired
# with some other row's Purchase in all later cells; it can also fail outright
# when .values is handed back as a read-only array.
purchase_sorted = np.sort(X_train["Purchase"].values)



In [11]:

    
# Show the 100 largest purchase amounts in ascending order.
# The sorted copy is built inside this cell, so the result does not depend on
# an earlier cell having mutated X_train, and X_train itself is left untouched.
# The index is the rank position in the sorted order, not the original row label.
pd.Series(np.sort(X_train["Purchase"].values), name="Purchase").tail(100)









    Out[11]:





549968    23919
549969    23920
549970    23920
549971    23921
549972    23921
549973    23921
549974    23923
549975    23924
549976    23924
549977    23925
549978    23926
549979    23926
549980    23926
549981    23927
549982    23927
549983    23928
549984    23928
549985    23928
549986    23928
549987    23929
549988    23929
549989    23929
549990    23930
549991    23930
549992    23930
549993    23930
549994    23930
549995    23931
549996    23931
549997    23932
          ...  
550038    23948
550039    23948
550040    23949
550041    23949
550042    23949
550043    23949
550044    23950
550045    23951
550046    23952
550047    23953
550048    23953
550049    23954
550050    23954
550051    23955
550052    23955
550053    23955
550054    23956
550055    23958
550056    23958
550057    23958
550058    23958
550059    23959
550060    23959
550061    23960
550062    23960
550063    23960
550064    23960
550065    23961
550066    23961
550067    23961
Name: Purchase, dtype: int64



In [14]:

    
# Average Purchase for each Product_Category_3 code in the training set.
X_train["Purchase"].groupby(X_train["Product_Category_3"]).mean()









    Out[14]:





Product_Category_3
3     13939.696574
4      9794.386667
5     12117.786889
6     13194.311043
8     13024.918882
9     10431.697210
10    13505.813441
11    12091.437673
12     8715.512762
13    13185.118703
14    10052.594530
15    12339.369900
16    11981.890642
17    11769.943001
18    10993.980773
Name: Purchase, dtype: float64



In [4]:

    
# Pairwise Pearson correlation of the numeric training columns.
# X_train also holds object columns (Product_ID, Gender, Age, ...). Older
# pandas dropped them silently, but pandas >= 2.0 raises on them, so the
# numeric columns are selected explicitly to get the same table everywhere.
# NOTE(review): several of these columns look like ID / category codes, for
# which a linear correlation may not be meaningful -- confirm before relying on it.
X_train.select_dtypes(include=[np.number]).corr()









    Out[4]:






  
    
      
      User_ID
      Occupation
      Marital_Status
      Product_Category_1
      Product_Category_2
      Product_Category_3
      Purchase
    
  
  
    
      User_ID
      1.000000
      -0.023971
      0.020443
      0.003825
      0.001529
      0.003419
      0.004716
    
    
      Occupation
      -0.023971
      1.000000
      0.024280
      -0.007618
      -0.000384
      0.013263
      0.020833
    
    
      Marital_Status
      0.020443
      0.024280
      1.000000
      0.019888
      0.015138
      0.019473
      -0.000463
    
    
      Product_Category_1
      0.003825
      -0.007618
      0.019888
      1.000000
      0.540583
      0.229678
      -0.343703
    
    
      Product_Category_2
      0.001529
      -0.000384
      0.015138
      0.540583
      1.000000
      0.543649
      -0.209918
    
    
      Product_Category_3
      0.003419
      0.013263
      0.019473
      0.229678
      0.543649
      1.000000
      -0.022006
    
    
      Purchase
      0.004716
      0.020833
      -0.000463
      -0.343703
      -0.209918
      -0.022006
      1.000000



In [6]:

    
# Distribution of the total Purchase per user in the training set.
ax = X_train.groupby("User_ID")["Purchase"].sum().hist()
# Label the figure so it stands alone; the trailing ';' hides the repr noise.
ax.set(title="Total purchase per user (train)",
       xlabel="Sum of Purchase per User_ID",
       ylabel="Number of users");









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fc5d040d250>



In [8]:

    
# Smallest per-user purchase total in the training set.
X_train["Purchase"].groupby(X_train["User_ID"]).sum().min()









    Out[8]:





46681



In [11]:

    
# Average per-user purchase total in the training set.
X_train["Purchase"].groupby(X_train["User_ID"]).sum().mean()









    Out[11]:





865016.5917501273



In [14]:

    
# Decile breakdown of the per-user purchase totals.
# The 1.0 entry produces a "100%" row in addition to the built-in "max" row.
decile_points = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]
user_totals = X_train.groupby("User_ID")["Purchase"].sum()
user_totals.describe(percentiles=decile_points)









    Out[14]:





count        5891.000000
mean       865016.591750
std        943644.539683
min         46681.000000
10%        146570.000000
20%        205272.000000
30%        279288.000000
40%        383455.000000
50%        521213.000000
60%        698842.000000
70%        942900.000000
80%       1355245.000000
90%       2069404.000000
100%     10536909.000000
max      10536909.000000
Name: Purchase, dtype: float64



In [15]:

    
# Total Purchase per user, indexed by User_ID.
res = X_train["Purchase"].groupby(X_train["User_ID"]).sum()



In [16]:

    
# Display the per-user purchase totals held in `res`.
res









    Out[16]:





User_ID
1000001     334093
1000002     810472
1000003     341635
1000004     206468
1000005     821001
1000006     379930
1000007     234668
1000008     796593
1000009     594099
1000010    2169510
1000011     557023
1000012     120801
1000013     713927
1000014     127629
1000015    1047728
1000016     150490
1000017    1425995
1000018    1979047
1000019    1458069
1000020     185747
1000021     127099
1000022    1279914
1000023    1670998
1000024     720899
1000025     534706
1000026    1606174
1000027     448934
1000028     516560
1000029     696201
1000030     261584
            ...   
1006011    1198714
1006012     127920
1006013     622847
1006014     528238
1006015     255812
1006016    3770970
1006017     160230
1006018     975585
1006019     604563
1006020     374475
1006021     709448
1006022     278257
1006023    1222210
1006024     827570
1006025    1040257
1006026     490768
1006027     265201
1006028     362972
1006029     157436
1006030     737361
1006031     286374
1006032     517261
1006033     501843
1006034     197086
1006035     956645
1006036    4116058
1006037    1119538
1006038      90034
1006039     590319
1006040    1653299
Name: Purchase, dtype: int64

Trying out whether I can shuffle two arrays with dimensions like

a.shape (3, 2, 3)

b.shape (3, 2)

I am trying to shuffle a and b together, so that whenever a row of a moves to a new position (e.g. row 2 of a goes to row 1), the matching row of b makes the same move.



In [3]:

    
# Toy arrays for the shuffle experiment: both have 3 rows along axis 0.
# a holds 0.0 .. 17.0 laid out as (3, 2, 3); b holds 0.0 .. 5.0 laid out as (3, 2).
a = np.arange(18, dtype=float).reshape(3, 2, 3)

b = np.arange(6, dtype=float).reshape(3, 2)



In [5]:

    
# Show the shapes of both arrays.
# print() is called as a function so the cell runs on Python 2 and Python 3.
print(a.shape)
print(b.shape)









    



(3, 2, 3)
(3, 2)

Merge the 2 arrays into 1 array



In [22]:

    
# Flatten every row of a and of b to 1-D, then place the two flattened
# blocks side by side so each row of c carries one row of a followed by
# the matching row of b.
a_rows = a.reshape(a.shape[0], -1)
b_rows = b.reshape(b.shape[0], -1)
c = np.c_[a_rows, b_rows]



In [23]:

    
# Show the merged array.
# print() is called as a function so the cell runs on Python 2 and Python 3.
print(c)









    



[[  0.   1.   2.   3.   4.   5.   0.   1.]
 [  6.   7.   8.   9.  10.  11.   2.   3.]
 [ 12.  13.  14.  15.  16.  17.   4.   5.]]

Extract the 2 arrays out



In [24]:

    
# Width of one flattened row of a, i.e. the column where b's data starts in c.
split_col = a.size // len(a)
# Slice c back into its a-part and b-part and restore the original shapes.
# The recorded output after the shuffle below shows a2/b2 changing with c,
# i.e. these act as views onto c rather than independent copies.
a2 = c[:, :split_col].reshape(a.shape)
b2 = c[:, split_col:].reshape(b.shape)



In [25]:

    
# Show the two arrays recovered from c (before shuffling).
# print() is called as a function so the cell runs on Python 2 and Python 3.
print(a2)
print(b2)









    



[[[  0.   1.   2.]
  [  3.   4.   5.]]

 [[  6.   7.   8.]
  [  9.  10.  11.]]

 [[ 12.  13.  14.]
  [ 15.  16.  17.]]]
[[ 0.  1.]
 [ 2.  3.]
 [ 4.  5.]]

Shuffle and see the output.



In [28]:

    
# Fix the seed so the permutation is the same on every Restart & Run All.
np.random.seed(0)
# Shuffle the rows of c in place (np.random.shuffle permutes along axis 0).
np.random.shuffle(c)



In [29]:

    
# Show a2 and b2 again after c was shuffled.
# print() is called as a function so the cell runs on Python 2 and Python 3.
print(a2)
print(b2)









    



[[[  6.   7.   8.]
  [  9.  10.  11.]]

 [[ 12.  13.  14.]
  [ 15.  16.  17.]]

 [[  0.   1.   2.]
  [  3.   4.   5.]]]
[[ 2.  3.]
 [ 4.  5.]
 [ 0.  1.]]



In [ ]:

	User_ID	Product_ID	Gender	Age	Occupation	City_Category	Stay_In_Current_City_Years	Product_Category_1	Product_Category_2	Product_Category_3	Purchase
0	1000001	P00069042	F	0-17	10	A	2	3	NaN	NaN	8370
1	1000001	P00248942	F	0-17	10	A	2	1	6	14	15200
2	1000001	P00087842	F	0-17	10	A	2	12	NaN	NaN	1422
3	1000001	P00085442	F	0-17	10	A	2	12	14	NaN	1057
4	1000002	P00285442	M	55+	16	C	4+	8	NaN	NaN	7969

	User_ID	Product_ID	Gender	Age	Occupation	City_Category	Stay_In_Current_City_Years	Marital_Status	Product_Category_1	Product_Category_2	Product_Category_3
0	1000004	P00128942	M	46-50	7	B	2	1	1	11	NaN
1	1000009	P00113442	M	26-35	17	C	0	0	3	5	NaN
2	1000010	P00288442	F	36-45	1	B	4+	1	5	14	NaN
3	1000010	P00145342	F	36-45	1	B	4+	1	4	9	NaN
4	1000011	P00053842	F	26-35	1	C	1	0	4	5	12

	User_ID	Occupation	Marital_Status	Product_Category_1	Product_Category_2	Product_Category_3	Purchase
count	550068.000000	550068.000000	550068.000000	550068.000000	376430.000000	166821.000000	550068.000000
mean	1003028.842401	8.076707	0.409653	5.404270	9.842329	12.668243	9263.968713
std	1727.591586	6.522660	0.491770	3.936211	5.086590	4.125338	5023.065394
min	1000001.000000	0.000000	0.000000	1.000000	2.000000	3.000000	12.000000
25%	1001516.000000	2.000000	0.000000	1.000000	5.000000	9.000000	5823.000000
50%	1003077.000000	7.000000	0.000000	5.000000	9.000000	14.000000	8047.000000
75%	1004478.000000	14.000000	1.000000	8.000000	15.000000	16.000000	12054.000000
max	1006040.000000	20.000000	1.000000	20.000000	18.000000	18.000000	23961.000000

	User_ID	Occupation	Marital_Status	Product_Category_1	Product_Category_2	Product_Category_3
count	233599.000000	233599.000000	233599.000000	233599.000000	161255.000000	71037.000000
mean	1003029.356859	8.085407	0.410070	5.276542	9.849586	12.669454
std	1726.504968	6.521146	0.491847	3.736380	5.094943	4.125944
min	1000001.000000	0.000000	0.000000	1.000000	2.000000	3.000000
25%	1001527.000000	2.000000	0.000000	1.000000	5.000000	9.000000
50%	1003070.000000	7.000000	0.000000	5.000000	9.000000	14.000000
75%	1004477.000000	14.000000	1.000000	8.000000	15.000000	16.000000
max	1006040.000000	20.000000	1.000000	18.000000	18.000000	18.000000

	User_ID	Occupation	Marital_Status	Product_Category_1	Product_Category_2	Product_Category_3	Purchase
User_ID	1.000000	-0.023971	0.020443	0.003825	0.001529	0.003419	0.004716
Occupation	-0.023971	1.000000	0.024280	-0.007618	-0.000384	0.013263	0.020833
Marital_Status	0.020443	0.024280	1.000000	0.019888	0.015138	0.019473	-0.000463
Product_Category_1	0.003825	-0.007618	0.019888	1.000000	0.540583	0.229678	-0.343703
Product_Category_2	0.001529	-0.000384	0.015138	0.540583	1.000000	0.543649	-0.209918
Product_Category_3	0.003419	0.013263	0.019473	0.229678	0.543649	1.000000	-0.022006
Purchase	0.004716	0.020833	-0.000463	-0.343703	-0.209918	-0.022006	1.000000