notebook.community

Edit and run



In [7]:

    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline



In [ ]:



In [8]:

    
df = pd.read_csv('dataset_1mb.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df['Outcome'] = df['Outcome'].replace('c', 1).replace('w', 0)
df = df[(df['Gender'] != '0') & (df['Age'] != 0)]
df = df[df['OS'] != '0']
df.to_csv('data.csv', index=False)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df[:2]









    Out[8]:






  
    
      
      BidId
      TrafficType
      PublisherId
      AppSiteId
      AppSiteCategory
      Position
      BidFloor
      Timestamp
      Age
      Gender
      ...
      Latitude
      Longitude
      Zipcode
      GeoType
      CampaignId
      CreativeId
      CreativeType
      CreativeCategory
      ExchangeBid
      Outcome
    
  
  
    
      3
      7788eea7b165049bd9a2dcdffdc0e130d27bad9c
      app
      9f9e722a5209569959fc30193020ebffa5d0f13e
      f3f447abf6aa15def97ddbcc0f8694eb8d8a4839
      Arts & Entertainment
      Above the Fold
      0
      2015-10-28 00:21:41
      22
      M
      ...
      14.6
      121
      0
      IP
      69463
      867750
      2
      Television
      0.02
      0
    
    
      5
      6644f74714787585149e10aa3d66750d7f1bdd59
      app
      1ab335267bb1f1691652228eddff9ccf32ecd096
      9e2f564ea8403f0e8a36366bcc4d29b2b86bd299
      Society
      Above the Fold
      0
      2015-10-28 00:19:45
      14
      M
      ...
      41.3
      -74
      10980
      IP
      54351
      389378
      2
      Hotels#Advertising
      0.22
      0
    
  

2 rows × 29 columns



In [9]:

    
df.describe()









    Out[9]:






  
    
      
      BidFloor
      Age
      Latitude
      Longitude
      Zipcode
      CampaignId
      CreativeId
      CreativeType
      ExchangeBid
    
  
  
    
      count
      807
      807.000000
      807.000000
      807.000000
      807.000000
      807.000000
      807.000000
      807.000000
      807.000000
    
    
      mean
      0
      26.086741
      25.568897
      -34.450434
      24657.719950
      62031.272615
      818108.287485
      1.961586
      0.084895
    
    
      std
      0
      11.223752
      19.851134
      91.714190
      35200.613346
      4756.569081
      105485.014961
      0.192312
      0.195001
    
    
      min
      0
      12.000000
      -80.200000
      -149.900000
      0.000000
      54351.000000
      389378.000000
      1.000000
      0.000000
    
    
      25%
      0
      19.000000
      1.450000
      -96.800000
      0.000000
      57139.000000
      831042.000000
      2.000000
      0.020000
    
    
      50%
      0
      23.000000
      34.100000
      -83.000000
      0.000000
      61863.000000
      835106.000000
      2.000000
      0.060000
    
    
      75%
      0
      30.000000
      40.100000
      106.800000
      45955.500000
      66591.000000
      866164.000000
      2.000000
      0.070000
    
    
      max
      0
      106.000000
      61.200000
      153.000000
      350003.000000
      71007.000000
      872738.000000
      2.000000
      3.330000



In [10]:

    
readables = ['AppSiteCategory', 'Position', 
             'Age', 'Gender',
             'DeviceType', 
            'CreativeCategory']

# Country?
# OS

countries = df['Country'].unique()
# Outcome
# TrafficType
# Model 
# Carrier 
# GeoType
# Manufacturer



In [11]:

    
for item in readables:
    print(item + ": \n",df[item].value_counts()[:5],"\n"+"="*15)









    



AppSiteCategory: 
 Arts & Entertainment                           429
Society                                        280
Technology & Computing#Arts & Entertainment     47
Technology & Computing                          44
Arts & Entertainment#Society                     3
Name: AppSiteCategory, dtype: int64 
===============
Position: 
 Above the Fold    691
Footer             91
Unknown            21
Header              3
Below the Fold      1
Name: Position, dtype: int64 
===============
Age: 
 19    85
20    68
21    41
18    40
22    37
Name: Age, dtype: int64 
===============
Gender: 
 M    476
F    331
Name: Gender, dtype: int64 
===============
DeviceType: 
 Mobile/Tablet    387
Phone            357
Tablet            61
0                  2
Name: DeviceType, dtype: int64 
===============
CreativeCategory: 
 Financial Planning#Investing    303
Air Travel#Business Travel      207
Advertising#Comparison          113
Hotels#Advertising               45
Advertising#Cell Phones          40
Name: CreativeCategory, dtype: int64 
===============



In [12]:

    
df_agg = df.set_index(readables)
df_agg = df_agg.swaplevel(0,1).sortlevel(0)
df_agg = df_agg.swaplevel(0,1).sortlevel(0)
df_agg[:2]









    Out[12]:






  
    
      
      
      
      
      
      
      BidId
      TrafficType
      PublisherId
      AppSiteId
      BidFloor
      Timestamp
      OS
      OSVersion
      Model
      Manufacturer
      ...
      Country
      Latitude
      Longitude
      Zipcode
      GeoType
      CampaignId
      CreativeId
      CreativeType
      ExchangeBid
      Outcome
    
    
      AppSiteCategory
      Position
      Age
      Gender
      DeviceType
      CreativeCategory
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Arts & Entertainment
      Above the Fold
      12
      F
      Mobile/Tablet
      Advertising#Cell Phones
      e81deba4db04e78a6c9d7cdfc10edbc6b90873c5
      app
      7ba4d649042a1a655f3a3ba02267fa1ab66d17e7
      660ec2e1724b3b9b7804c0258970ea26a6d7a4de
      0
      2015-10-28 00:51:48
      Android
      5
      0
      Samsung
      ...
      IDN
      -6.2
      106.8
      0
      User
      69891
      868690
      2
      0.18
      0
    
    
      Air Travel#Business Travel
      01e57be3a57b4bd48d5d9d94e3e064612ba103db
      app
      7ba4d649042a1a655f3a3ba02267fa1ab66d17e7
      a9fe9aff8929976f110b27a8aef236516adaa6d6
      0
      2015-10-28 00:44:50
      Android
      5
      Samsung SM-G900P
      Samsung
      ...
      USA
      32.8
      -97.1
      0
      GPS
      57139
      835106
      2
      0.02
      0
    
  

2 rows × 23 columns



In [13]:

    
df_corr = df_agg.groupby(level=[7,3,1]).mean()['ExchangeBid']
df_corr.plot









    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-13-9f7d5671d168> in <module>()
----> 1 df_corr = df_agg.groupby(level=[7,3,1]).mean()['ExchangeBid']
      2 df_corr.plot

/usr/lib64/python3.4/site-packages/pandas/core/generic.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze)
   3322         axis = self._get_axis_number(axis)
   3323         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
-> 3324                        sort=sort, group_keys=group_keys, squeeze=squeeze)
   3325 
   3326     def asfreq(self, freq, method=None, how=None, normalize=False):

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in groupby(obj, by, **kwds)
   1250         raise TypeError('invalid type: %s' % type(obj))
   1251 
-> 1252     return klass(obj, by, **kwds)
   1253 
   1254 

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze)
    414         if grouper is None:
    415             grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
--> 416                                                     level=level, sort=sort)
    417 
    418         self.obj = obj

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in _get_grouper(obj, key, axis, level, sort)
   2174 
   2175         ping = Grouping(group_axis, gpr, obj=obj, name=name,
-> 2176                         level=level, sort=sort, in_axis=in_axis)
   2177 
   2178         groupings.append(ping)

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in __init__(self, index, grouper, obj, name, level, sort, in_axis)
   1920                 level = index.names.index(level)
   1921 
-> 1922             inds = index.labels[level]
   1923             level_index = index.levels[level]
   1924 

/usr/lib64/python3.4/site-packages/pandas/core/base.py in __getitem__(self, n)
    220         if isinstance(n, slice):
    221             return self.__class__(super(FrozenList, self).__getitem__(n))
--> 222         return super(FrozenList, self).__getitem__(n)
    223 
    224     def __radd__(self, other):

IndexError: list index out of range

Correlation b/w bid winnings and creative category



In [ ]:

    
# #df_corr.reset_index()
# df_corr = df.set_index(['Country', 'AppSiteCategory', 'Gender', 'Age']).sortlevel(0)
# df_corr#['ExchangeBid']#.groupby(level=[0,1,2,3]).mean()
df_country_bids = df[['Country', 'CreativeCategory', 'ExchangeBid']].sort_values(by='Country').set_index(['Country', 'CreativeCategory'])
df_con_corr = df_country_bids.groupby(level=1).mean().sort_values(by='ExchangeBid')
df_con_corr.plot(figsize=(16,8), kind='bar', linewidth=2)



In [ ]:

    
# df.corr(method='pearson') #kendall



In [ ]:

    
df.set_index('Timestamp').resample('5T', how={'ExchangeBid': pd.np.sum, 'CreativeCategory': 'count'}).plot(figsize=(16,8), kind='bar', linewidth=2)



In [ ]:

    
sizes = {}
for country in countries: 
    print(country, df_country_bids.loc[country].size)
    # df_country_bids.loc[country]['Count'] = df_country_bids.loc[country].size
    sizes[country] = df_country_bids.loc[country].size



In [ ]:

    
sizes



In [ ]:

    
df_country_bids



In [ ]:

    
df.set_index('Timestamp')[['ExchangeBid']].resample('5T', how='sum')



In [ ]:

    
# pd.Series(["a","b","c","a"], dtype="category")
df_agg#.swaplevel(0,9).sortlevel(0)
# #df_country = df_country#.swaplevel(0,9)#replace(0, pd.np.nan)
# # USA
# df_usa = df_country.swaplevel(0,8).loc['USA'].sortlevel(level=[0,5,8])
# # df_country
# df_usa = df_usa.reset_index(['Age', 'Gender'])



In [ ]:

    
df_coun



In [ ]:

	BidId	TrafficType	PublisherId	AppSiteId	AppSiteCategory	Position	BidFloor	Timestamp	Age	Gender	...	Latitude	Longitude	Zipcode	GeoType	CampaignId	CreativeId	CreativeType	CreativeCategory	ExchangeBid	Outcome
3	7788eea7b165049bd9a2dcdffdc0e130d27bad9c	app	9f9e722a5209569959fc30193020ebffa5d0f13e	f3f447abf6aa15def97ddbcc0f8694eb8d8a4839	Arts & Entertainment	Above the Fold	0	2015-10-28 00:21:41	22	M	...	14.6	121	0	IP	69463	867750	2	Television	0.02	0
5	6644f74714787585149e10aa3d66750d7f1bdd59	app	1ab335267bb1f1691652228eddff9ccf32ecd096	9e2f564ea8403f0e8a36366bcc4d29b2b86bd299	Society	Above the Fold	0	2015-10-28 00:19:45	14	M	...	41.3	-74	10980	IP	54351	389378	2	Hotels#Advertising	0.22	0

	BidFloor	Age	Latitude	Longitude	Zipcode	CampaignId	CreativeId	CreativeType	ExchangeBid
count	807	807.000000	807.000000	807.000000	807.000000	807.000000	807.000000	807.000000	807.000000
mean	0	26.086741	25.568897	-34.450434	24657.719950	62031.272615	818108.287485	1.961586	0.084895
std	0	11.223752	19.851134	91.714190	35200.613346	4756.569081	105485.014961	0.192312	0.195001
min	0	12.000000	-80.200000	-149.900000	0.000000	54351.000000	389378.000000	1.000000	0.000000
25%	0	19.000000	1.450000	-96.800000	0.000000	57139.000000	831042.000000	2.000000	0.020000
50%	0	23.000000	34.100000	-83.000000	0.000000	61863.000000	835106.000000	2.000000	0.060000
75%	0	30.000000	40.100000	106.800000	45955.500000	66591.000000	866164.000000	2.000000	0.070000
max	0	106.000000	61.200000	153.000000	350003.000000	71007.000000	872738.000000	2.000000	3.330000

						BidId	TrafficType	PublisherId	AppSiteId	BidFloor	Timestamp	OS	OSVersion	Model	Manufacturer	...	Country	Latitude	Longitude	Zipcode	GeoType	CampaignId	CreativeId	CreativeType	ExchangeBid	Outcome
AppSiteCategory	Position	Age	Gender	DeviceType	CreativeCategory
Arts & Entertainment	Above the Fold	12	F	Mobile/Tablet	Advertising#Cell Phones	e81deba4db04e78a6c9d7cdfc10edbc6b90873c5	app	7ba4d649042a1a655f3a3ba02267fa1ab66d17e7	660ec2e1724b3b9b7804c0258970ea26a6d7a4de	0	2015-10-28 00:51:48	Android	5	0	Samsung	...	IDN	-6.2	106.8	0	User	69891	868690	2	0.18	0
Arts & Entertainment	Above the Fold	12	F	Mobile/Tablet	Air Travel#Business Travel	01e57be3a57b4bd48d5d9d94e3e064612ba103db	app	7ba4d649042a1a655f3a3ba02267fa1ab66d17e7	a9fe9aff8929976f110b27a8aef236516adaa6d6	0	2015-10-28 00:44:50	Android	5	Samsung SM-G900P	Samsung	...	USA	32.8	-97.1	0	GPS	57139	835106	2	0.02	0