In [7]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:


In [8]:
df = pd.read_csv('dataset_1mb.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df['Outcome'] = df['Outcome'].replace('c', 1).replace('w', 0)
df = df[(df['Gender'] != '0') & (df['Age'] != 0)]
df = df[df['OS'] != '0']
df.to_csv('data.csv', index=False)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df[:2]


Out[8]:
BidId TrafficType PublisherId AppSiteId AppSiteCategory Position BidFloor Timestamp Age Gender ... Latitude Longitude Zipcode GeoType CampaignId CreativeId CreativeType CreativeCategory ExchangeBid Outcome
3 7788eea7b165049bd9a2dcdffdc0e130d27bad9c app 9f9e722a5209569959fc30193020ebffa5d0f13e f3f447abf6aa15def97ddbcc0f8694eb8d8a4839 Arts & Entertainment Above the Fold 0 2015-10-28 00:21:41 22 M ... 14.6 121 0 IP 69463 867750 2 Television 0.02 0
5 6644f74714787585149e10aa3d66750d7f1bdd59 app 1ab335267bb1f1691652228eddff9ccf32ecd096 9e2f564ea8403f0e8a36366bcc4d29b2b86bd299 Society Above the Fold 0 2015-10-28 00:19:45 14 M ... 41.3 -74 10980 IP 54351 389378 2 Hotels#Advertising 0.22 0

2 rows × 29 columns


In [9]:
df.describe()


Out[9]:
BidFloor Age Latitude Longitude Zipcode CampaignId CreativeId CreativeType ExchangeBid
count 807 807.000000 807.000000 807.000000 807.000000 807.000000 807.000000 807.000000 807.000000
mean 0 26.086741 25.568897 -34.450434 24657.719950 62031.272615 818108.287485 1.961586 0.084895
std 0 11.223752 19.851134 91.714190 35200.613346 4756.569081 105485.014961 0.192312 0.195001
min 0 12.000000 -80.200000 -149.900000 0.000000 54351.000000 389378.000000 1.000000 0.000000
25% 0 19.000000 1.450000 -96.800000 0.000000 57139.000000 831042.000000 2.000000 0.020000
50% 0 23.000000 34.100000 -83.000000 0.000000 61863.000000 835106.000000 2.000000 0.060000
75% 0 30.000000 40.100000 106.800000 45955.500000 66591.000000 866164.000000 2.000000 0.070000
max 0 106.000000 61.200000 153.000000 350003.000000 71007.000000 872738.000000 2.000000 3.330000

In [10]:
readables = ['AppSiteCategory', 'Position', 
             'Age', 'Gender',
             'DeviceType', 
            'CreativeCategory']

# Country?
# OS

countries = df['Country'].unique()
# Outcome
# TrafficType
# Model 
# Carrier 
# GeoType
# Manufacturer

In [11]:
for item in readables:
    print(item + ": \n",df[item].value_counts()[:5],"\n"+"="*15)


AppSiteCategory: 
 Arts & Entertainment                           429
Society                                        280
Technology & Computing#Arts & Entertainment     47
Technology & Computing                          44
Arts & Entertainment#Society                     3
Name: AppSiteCategory, dtype: int64 
===============
Position: 
 Above the Fold    691
Footer             91
Unknown            21
Header              3
Below the Fold      1
Name: Position, dtype: int64 
===============
Age: 
 19    85
20    68
21    41
18    40
22    37
Name: Age, dtype: int64 
===============
Gender: 
 M    476
F    331
Name: Gender, dtype: int64 
===============
DeviceType: 
 Mobile/Tablet    387
Phone            357
Tablet            61
0                  2
Name: DeviceType, dtype: int64 
===============
CreativeCategory: 
 Financial Planning#Investing    303
Air Travel#Business Travel      207
Advertising#Comparison          113
Hotels#Advertising               45
Advertising#Cell Phones          40
Name: CreativeCategory, dtype: int64 
===============

In [12]:
df_agg = df.set_index(readables)
df_agg = df_agg.swaplevel(0,1).sortlevel(0)
df_agg = df_agg.swaplevel(0,1).sortlevel(0)
df_agg[:2]


Out[12]:
BidId TrafficType PublisherId AppSiteId BidFloor Timestamp OS OSVersion Model Manufacturer ... Country Latitude Longitude Zipcode GeoType CampaignId CreativeId CreativeType ExchangeBid Outcome
AppSiteCategory Position Age Gender DeviceType CreativeCategory
Arts & Entertainment Above the Fold 12 F Mobile/Tablet Advertising#Cell Phones e81deba4db04e78a6c9d7cdfc10edbc6b90873c5 app 7ba4d649042a1a655f3a3ba02267fa1ab66d17e7 660ec2e1724b3b9b7804c0258970ea26a6d7a4de 0 2015-10-28 00:51:48 Android 5 0 Samsung ... IDN -6.2 106.8 0 User 69891 868690 2 0.18 0
Air Travel#Business Travel 01e57be3a57b4bd48d5d9d94e3e064612ba103db app 7ba4d649042a1a655f3a3ba02267fa1ab66d17e7 a9fe9aff8929976f110b27a8aef236516adaa6d6 0 2015-10-28 00:44:50 Android 5 Samsung SM-G900P Samsung ... USA 32.8 -97.1 0 GPS 57139 835106 2 0.02 0

2 rows × 23 columns


In [13]:
df_corr = df_agg.groupby(level=[7,3,1]).mean()['ExchangeBid']
df_corr.plot


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-13-9f7d5671d168> in <module>()
----> 1 df_corr = df_agg.groupby(level=[7,3,1]).mean()['ExchangeBid']
      2 df_corr.plot

/usr/lib64/python3.4/site-packages/pandas/core/generic.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze)
   3322         axis = self._get_axis_number(axis)
   3323         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
-> 3324                        sort=sort, group_keys=group_keys, squeeze=squeeze)
   3325 
   3326     def asfreq(self, freq, method=None, how=None, normalize=False):

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in groupby(obj, by, **kwds)
   1250         raise TypeError('invalid type: %s' % type(obj))
   1251 
-> 1252     return klass(obj, by, **kwds)
   1253 
   1254 

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze)
    414         if grouper is None:
    415             grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
--> 416                                                     level=level, sort=sort)
    417 
    418         self.obj = obj

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in _get_grouper(obj, key, axis, level, sort)
   2174 
   2175         ping = Grouping(group_axis, gpr, obj=obj, name=name,
-> 2176                         level=level, sort=sort, in_axis=in_axis)
   2177 
   2178         groupings.append(ping)

/usr/lib64/python3.4/site-packages/pandas/core/groupby.py in __init__(self, index, grouper, obj, name, level, sort, in_axis)
   1920                 level = index.names.index(level)
   1921 
-> 1922             inds = index.labels[level]
   1923             level_index = index.levels[level]
   1924 

/usr/lib64/python3.4/site-packages/pandas/core/base.py in __getitem__(self, n)
    220         if isinstance(n, slice):
    221             return self.__class__(super(FrozenList, self).__getitem__(n))
--> 222         return super(FrozenList, self).__getitem__(n)
    223 
    224     def __radd__(self, other):

IndexError: list index out of range

Correlation b/w bid winnings and creative category


In [ ]:
# #df_corr.reset_index()
# df_corr = df.set_index(['Country', 'AppSiteCategory', 'Gender', 'Age']).sortlevel(0)
# df_corr#['ExchangeBid']#.groupby(level=[0,1,2,3]).mean()
df_country_bids = df[['Country', 'CreativeCategory', 'ExchangeBid']].sort_values(by='Country').set_index(['Country', 'CreativeCategory'])
df_con_corr = df_country_bids.groupby(level=1).mean().sort_values(by='ExchangeBid')
df_con_corr.plot(figsize=(16,8), kind='bar', linewidth=2)

In [ ]:
# df.corr(method='pearson') #kendall

In [ ]:
df.set_index('Timestamp').resample('5T', how={'ExchangeBid': pd.np.sum, 'CreativeCategory': 'count'}).plot(figsize=(16,8), kind='bar', linewidth=2)

In [ ]:
sizes = {}
for country in countries: 
    print(country, df_country_bids.loc[country].size)
    # df_country_bids.loc[country]['Count'] = df_country_bids.loc[country].size
    sizes[country] = df_country_bids.loc[country].size

In [ ]:
sizes

In [ ]:
df_country_bids

In [ ]:
df.set_index('Timestamp')[['ExchangeBid']].resample('5T', how='sum')

In [ ]:
# pd.Series(["a","b","c","a"], dtype="category")
df_agg#.swaplevel(0,9).sortlevel(0)
# #df_country = df_country#.swaplevel(0,9)#replace(0, pd.np.nan)
# # USA
# df_usa = df_country.swaplevel(0,8).loc['USA'].sortlevel(level=[0,5,8])
# # df_country
# df_usa = df_usa.reset_index(['Age', 'Gender'])

In [ ]:
df_coun

In [ ]: