In [1]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
In online advertising, click-through rate (CTR) is a very important metric for evaluating ad performance.
This competition provides 11 days worth of Avazu data to build and test prediction models.
Additional details:
Experiment with a subset of the training data.
In [1]:
# Take the first 400,000 data rows (plus the header line) of the full
# training file as a working subset for quick experiments.
!head -n 400001 data/train_rev2.csv > data/train_sub.csv
In [2]:
# Load the 400K-row subset created above.
train = pd.read_csv("data/train_sub.csv")
In [3]:
# Preview the first 10 rows to see the raw column formats.
train.head(10)
Out[3]:
In [4]:
# (rows, columns) of the subset.
train.shape
Out[4]:
In [5]:
# List all attribute names.
train.columns
Out[5]:
The total # of clicks/non-clicks was targeted at 200K.
In [13]:
# Non-null row count per 'hour' bucket; equal counts across all columns
# within a row would indicate no missing values.
train.groupby('hour').count()
Out[13]:
The # of clicks is not exactly 200K due to hash collisions. No NaNs exist.
Some thoughts so far:
In [2]:
# Subsample train_rev2.csv into 10 subset files (train_0.csv .. train_9.csv).
#%run subsample.py # CAUTION! it takes a while
# Verify the subset files exist and check their sizes.
!ls -l data/train*
In [3]:
# Load the first of the 10 subsampled files.
train0 = pd.read_csv('data/train_0.csv')
In [34]:
# (rows, columns) of subset 0.
train0.shape
Out[34]:
In [5]:
# Quick look at the first rows of subset 0.
train0.head()
Out[5]:
Encode 'hour' into 'day' and 'short_hour'. That is, extract information from the attribute 'hour'.
In [6]:
# Row count per 'day' (presumably derived from 'hour' by subsample.py,
# per the markdown above -- TODO confirm; the encoding step is not shown here).
train0.groupby('day').count()
Out[6]:
In [7]:
# Row count per hour-of-day ('short_hour', presumably also added by
# subsample.py -- TODO confirm).
train0.groupby('short_hour').count()
Out[7]:
Plot the ratio $\frac{\text{click}}{\text{click} + \text{non-click}}$ over hours and days.
In [8]:
# Click-through ratio per day, per hour-of-day, and per (day, hour) pair.
# 'click' is 0/1, so its group mean is the CTR.
# Select the 'click' column BEFORE aggregating: the result is identical, but
# the mean is computed for one column instead of every column (faster, and
# avoids errors on non-numeric columns in newer pandas versions).
ratio_day = train0.groupby('day')['click'].mean()
ratio_hour = train0.groupby('short_hour')['click'].mean()
ratio_dayhour = train0.groupby(['day', 'short_hour'])['click'].mean()
In [9]:
# Three stacked panels: daily, hourly, and per-(day, hour) CTR.
fig, (ax1, ax2, ax3) = plt.subplots(3)
# Derive the x-range from each series instead of hard-coding 10 / 24 / 239
# (239 is a magic number: 10 days x 24 hours = 240, so at least one
# (day, hour) bucket is missing from this sample). len() stays correct
# even if the bucket coverage changes.
ax1.plot(range(len(ratio_day)), ratio_day)
ax1.set_ylabel('Daily')
ax2.plot(range(len(ratio_hour)), ratio_hour)
ax2.set_ylabel('hourly')
ax3.plot(range(len(ratio_dayhour)), ratio_dayhour)
ax3.set_ylabel('day-hour')
Out[9]:
As one would expect, 'day' and 'hour' are two significant factors.
'banner_pos' is a significant categorical variable as well.
In [10]:
# Column means per banner position; the 'click' column here is the CTR
# for each banner position.
train0.groupby('banner_pos').mean()
Out[10]:
A closer look at the categorical variables.
In [11]:
# Integer-valued columns vs. categorical columns (the latter include the
# hashed string identifiers and small-cardinality device/banner codes).
intcols = ("click","day","short_hour","C1","C17","C18","C19","C20","C21","C22","C23","C24")
catcols = ("banner_pos","site_id","site_domain","site_category","app_id","app_domain","app_category","device_os","device_make","device_model","device_type","device_conn_type")
In [12]:
# Report the cardinality of each categorical column.
# Series.nunique() replaces np.unique(...).shape[0]: it gives the same count
# here (the data has no NaNs, per the note above) without building the full
# sorted unique array. The parenthesized print() form also runs under
# Python 3, unlike the original print statement.
for col_id in catcols:
    print(col_id.ljust(15) + 'has {0} unique values'.format(train0[col_id].nunique()))
In [16]:
# Inspect one full record to see the raw value formats of every column.
train0.iloc[0]
Out[16]:
In [30]:
# String-valued (hashed-identifier) categorical columns to feed into the
# feature hasher below.
chacols = ("site_id","site_domain","site_category","app_id","app_domain","app_category","device_os","device_make","device_model")
In [31]:
# Materialize the selected string columns as a raw object array (preview only;
# this is the row format the hasher will consume).
np.array(train0[list(chacols)])
Out[31]:
In [32]:
from sklearn.feature_extraction import FeatureHasher
fh = FeatureHasher(input_type='string')
# Generate a sparse CSR matrix: each row of string-valued categoricals is
# hashed into a fixed-width feature vector (hash collisions are possible
# by design of the hashing trick).
csr_fh = fh.fit_transform(np.array(train0[list(chacols)].astype(str)))
In [35]:
# Shape and internal CSR buffers of the hashed feature matrix.
# print(single_arg) is valid in both Python 2 and Python 3, unlike the
# original Python-2-only print statements.
print(csr_fh.shape)          # (n_samples, n_features)
print(csr_fh.data.shape)     # one entry per stored non-zero value
print(csr_fh.indices.shape)  # column index of each stored non-zero
print(csr_fh.indptr.shape)   # row boundaries: n_samples + 1 entries
In [36]:
test = pd.read_csv('data/test_rev2.csv')
# BUG FIX: the hasher was fitted on `chacols` (the 9 string columns), but
# the test set was being transformed with `catcols`, which additionally
# contains banner_pos, device_type and device_conn_type -- hashing a wider,
# different column set than the training matrix and making train/test
# feature spaces inconsistent. Use `chacols` here as well.
fh.transform(np.array(test[list(chacols)].astype(str)))
In [51]:
from scipy.sparse import csr_matrix
# Row-oriented (CSR) toy example: for row i, its non-zero values live in
# data[indptr[i]:indptr[i+1]] and their column positions in
# indices[indptr[i]:indptr[i+1]]. These arrays are reused by the cells below.
indptr = np.array([0,2,3,6])
indices = np.array([0,2,2,0,1,2])
data = np.array([1,2,3,4,5,6])
csr_matrix( (data,indices,indptr), shape=(3,3) ).todense()
Out[51]:
In [50]:
# non-zero values of the i-th row
print data[indptr[0]:indptr[0+1]]
print data[indptr[1]:indptr[1+1]]
print data[indptr[2]:indptr[2+1]]
In [52]:
# non-zero positions of the i-th row
print indices[indptr[0]:indptr[0+1]]
print indices[indptr[1]:indptr[1+1]]
print indices[indptr[2]:indptr[2+1]]
In [ ]: