In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import os.path
folder = os.path.join('..', 'airbnb','countries.csv')
In [2]:
df = pd.read_csv(folder, sep=',')
display(df)
country_destination
lat_destination
lng_destination
distance_km
destination_km2
destination_language
language_levenshtein_distance
0
AU
-26.853388
133.275160
15297.7440
7741220.0
eng
0.00
1
CA
62.393303
-96.818146
2828.1333
9984670.0
eng
0.00
2
DE
51.165707
10.452764
7879.5680
357022.0
deu
72.61
3
ES
39.896027
-2.487694
7730.7240
505370.0
spa
92.25
4
FR
46.232193
2.209667
7682.9450
643801.0
fra
92.06
5
GB
54.633220
-3.432277
6883.6590
243610.0
eng
0.00
6
IT
41.873990
12.564167
8636.6310
301340.0
ita
89.40
7
NL
52.133057
5.295250
7524.3203
41543.0
nld
63.22
8
PT
39.553444
-7.839319
7355.2534
92090.0
por
95.45
9
US
36.966427
-95.844030
0.0000
9826675.0
eng
0.00
In [3]:
from pandas.tools.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.show()
In [4]:
folder = os.path.join('..', 'airbnb','train_users_2.csv')
df_usr = pd.read_csv(folder, sep=',')
display(df_usr)
id
date_account_created
timestamp_first_active
date_first_booking
gender
age
signup_method
signup_flow
language
affiliate_channel
affiliate_provider
first_affiliate_tracked
signup_app
first_device_type
first_browser
country_destination
0
gxn3p5htnn
2010-06-28
20090319043255
NaN
-unknown-
NaN
facebook
0
en
direct
direct
untracked
Web
Mac Desktop
Chrome
NDF
1
820tgsjxq7
2011-05-25
20090523174809
NaN
MALE
38.0
facebook
0
en
seo
google
untracked
Web
Mac Desktop
Chrome
NDF
2
4ft3gnwmtx
2010-09-28
20090609231247
2010-08-02
FEMALE
56.0
basic
3
en
direct
direct
untracked
Web
Windows Desktop
IE
US
3
bjjt8pjhuk
2011-12-05
20091031060129
2012-09-08
FEMALE
42.0
facebook
0
en
direct
direct
untracked
Web
Mac Desktop
Firefox
other
4
87mebub9p4
2010-09-14
20091208061105
2010-02-18
-unknown-
41.0
basic
0
en
direct
direct
untracked
Web
Mac Desktop
Chrome
US
5
osr2jwljor
2010-01-01
20100101215619
2010-01-02
-unknown-
NaN
basic
0
en
other
other
omg
Web
Mac Desktop
Chrome
US
6
lsw9q7uk0j
2010-01-02
20100102012558
2010-01-05
FEMALE
46.0
basic
0
en
other
craigslist
untracked
Web
Mac Desktop
Safari
US
7
0d01nltbrs
2010-01-03
20100103191905
2010-01-13
FEMALE
47.0
basic
0
en
direct
direct
omg
Web
Mac Desktop
Safari
US
8
a1vcnhxeij
2010-01-04
20100104004211
2010-07-29
FEMALE
50.0
basic
0
en
other
craigslist
untracked
Web
Mac Desktop
Safari
US
9
6uh8zyj2gn
2010-01-04
20100104023758
2010-01-04
-unknown-
46.0
basic
0
en
other
craigslist
omg
Web
Mac Desktop
Firefox
US
10
yuuqmid2rp
2010-01-04
20100104194251
2010-01-06
FEMALE
36.0
basic
0
en
other
craigslist
untracked
Web
Mac Desktop
Firefox
US
11
om1ss59ys8
2010-01-05
20100105051812
NaN
FEMALE
47.0
basic
0
en
other
craigslist
untracked
Web
iPhone
-unknown-
NDF
12
k6np330cm1
2010-01-05
20100105060859
2010-01-18
-unknown-
NaN
basic
0
en
direct
direct
NaN
Web
Other/Unknown
-unknown-
FR
13
dy3rgx56cu
2010-01-05
20100105083259
NaN
FEMALE
37.0
basic
0
en
other
craigslist
linked
Web
Mac Desktop
Firefox
NDF
14
ju3h98ch3w
2010-01-07
20100107055820
NaN
FEMALE
36.0
basic
0
en
other
craigslist
untracked
Web
iPhone
Mobile Safari
NDF
15
v4d5rl22px
2010-01-07
20100107204555
2010-01-08
FEMALE
33.0
basic
0
en
direct
direct
untracked
Web
Windows Desktop
Chrome
CA
16
2dwbwkx056
2010-01-07
20100107215125
NaN
-unknown-
NaN
basic
0
en
other
craigslist
NaN
Web
Other/Unknown
-unknown-
NDF
17
frhre329au
2010-01-07
20100107224625
2010-01-09
-unknown-
31.0
basic
0
en
other
craigslist
NaN
Web
Other/Unknown
-unknown-
US
18
cxlg85pg1r
2010-01-08
20100108015641
NaN
-unknown-
NaN
basic
0
en
seo
facebook
NaN
Web
Other/Unknown
-unknown-
NDF
19
gdka1q5ktd
2010-01-10
20100110010817
2010-01-10
FEMALE
29.0
basic
0
en
direct
direct
untracked
Web
Mac Desktop
Chrome
FR
20
qdubonn3uk
2010-01-10
20100110152120
2010-01-18
-unknown-
NaN
basic
0
en
direct
direct
NaN
Web
Other/Unknown
-unknown-
US
21
qsibmuz9sx
2010-01-10
20100110220941
2010-01-11
MALE
30.0
basic
0
en
direct
direct
linked
Web
Mac Desktop
Chrome
US
22
80f7dwscrn
2010-01-11
20100111031438
2010-01-11
-unknown-
40.0
basic
0
en
seo
google
untracked
Web
iPhone
-unknown-
US
23
jha93x042q
2010-01-11
20100111224015
NaN
-unknown-
NaN
basic
0
en
other
craigslist
untracked
Web
Mac Desktop
Safari
NDF
24
7i49vnuav6
2010-01-11
20100111230808
NaN
FEMALE
40.0
basic
0
en
seo
google
untracked
Web
Mac Desktop
Firefox
NDF
25
al8bcetz0g
2010-01-12
20100112131444
2010-01-15
FEMALE
26.0
basic
0
en
other
craigslist
untracked
Web
Mac Desktop
Chrome
FR
26
bjg0m5otl3
2010-01-12
20100112155420
NaN
-unknown-
NaN
basic
0
en
other
other
untracked
Web
Other/Unknown
-unknown-
NDF
27
hfrl5gle36
2010-01-12
20100112205949
2010-01-22
FEMALE
32.0
basic
0
en
other
craigslist
untracked
Web
Desktop (Other)
Chrome
US
28
tp6x3md0n4
2010-01-13
20100113044650
2010-01-13
-unknown-
35.0
basic
0
en
direct
direct
NaN
Web
Other/Unknown
-unknown-
FR
29
hql77nu2lk
2010-01-13
20100113064333
2010-01-19
-unknown-
37.0
basic
0
en
direct
direct
untracked
Web
Android Tablet
-unknown-
US
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
213421
c98s3h7kgj
2014-06-30
20140630231137
NaN
-unknown-
NaN
basic
0
en
direct
direct
linked
Web
Mac Desktop
Firefox
NDF
213422
ytmpiwb8hj
2014-06-30
20140630231246
NaN
-unknown-
NaN
basic
0
en
direct
direct
untracked
Web
Windows Desktop
IE
NDF
213423
3dx1jk6yk2
2014-06-30
20140630231548
NaN
FEMALE
20.0
facebook
25
en
direct
direct
untracked
iOS
iPhone
-unknown-
NDF
213424
hcfj07iowv
2014-06-30
20140630231859
NaN
FEMALE
32.0
facebook
0
en
direct
direct
linked
Web
Windows Desktop
Chrome
NDF
213425
l1f71f9vsj
2014-06-30
20140630232119
NaN
FEMALE
30.0
facebook
0
en
direct
direct
linked
Web
Windows Desktop
Chrome
NDF
213426
15bj4ahmhf
2014-06-30
20140630232331
NaN
-unknown-
NaN
basic
0
en
direct
direct
untracked
Moweb
Android Phone
Chrome Mobile
NDF
213427
qwpybxfjdl
2014-06-30
20140630232539
NaN
-unknown-
NaN
basic
0
en
direct
direct
linked
Web
Desktop (Other)
Chrome
NDF
213428
k4t61wuvyq
2014-06-30
20140630232634
NaN
-unknown-
NaN
basic
23
en
direct
direct
untracked
Android
Android Phone
-unknown-
NDF
213429
mhh7b52z44
2014-06-30
20140630232712
NaN
-unknown-
NaN
basic
25
en
direct
direct
untracked
iOS
iPhone
-unknown-
NDF
213430
79wk7k2k5t
2014-06-30
20140630233132
NaN
-unknown-
19.0
basic
0
en
direct
direct
linked
Web
Mac Desktop
Chrome
NDF
213431
ftwmocvwlq
2014-06-30
20140630233203
NaN
-unknown-
NaN
basic
0
en
direct
direct
untracked
Web
Windows Desktop
Firefox
NDF
213432
rg7ayg1tob
2014-06-30
20140630233224
NaN
MALE
31.0
facebook
0
en
direct
direct
tracked-other
Web
Mac Desktop
Safari
NDF
213433
2f24umzkuv
2014-06-30
20140630233427
NaN
-unknown-
NaN
basic
0
en
sem-brand
google
untracked
Web
iPad
Mobile Safari
NDF
213434
or77n2ojuj
2014-06-30
20140630233640
NaN
-unknown-
NaN
basic
0
en
seo
facebook
product
Web
Mac Desktop
Chrome
NDF
213435
0a5bnb9bs4
2014-06-30
20140630233851
NaN
-unknown-
NaN
basic
0
en
seo
google
untracked
Web
Windows Desktop
Chrome
NDF
213436
6fzrn49sfn
2014-06-30
20140630234113
NaN
-unknown-
NaN
basic
25
en
direct
direct
untracked
iOS
iPhone
-unknown-
NDF
213437
r0jq0devgy
2014-06-30
20140630234243
NaN
-unknown-
NaN
basic
23
en
direct
direct
untracked
Android
Android Tablet
-unknown-
NDF
213438
v5lq9bj8gv
2014-06-30
20140630234429
NaN
-unknown-
NaN
basic
25
en
direct
direct
untracked
iOS
iPhone
-unknown-
NDF
213439
msucfwmlzc
2014-06-30
20140630234729
2015-03-16
MALE
43.0
basic
0
en
direct
direct
untracked
Web
Windows Desktop
Firefox
US
213440
04y8115avm
2014-06-30
20140630234933
NaN
FEMALE
24.0
basic
25
en
direct
direct
untracked
iOS
iPhone
Mobile Safari
NDF
213441
omlc9iku7t
2014-06-30
20140630235151
2014-08-13
FEMALE
34.0
basic
0
en
direct
direct
linked
Web
Mac Desktop
Chrome
ES
213442
rf0ay567js
2014-06-30
20140630235309
NaN
-unknown-
NaN
basic
0
en
sem-brand
google
omg
Web
Mac Desktop
Chrome
NDF
213443
0k26r3mir0
2014-06-30
20140630235340
2014-07-13
FEMALE
36.0
basic
0
en
sem-brand
google
linked
Web
Mac Desktop
Safari
US
213444
40o1ivh6cb
2014-06-30
20140630235352
NaN
-unknown-
NaN
basic
0
en
direct
direct
linked
Web
Windows Desktop
Chrome
NDF
213445
qbxza0xojf
2014-06-30
20140630235547
2014-07-02
FEMALE
23.0
basic
0
en
sem-brand
google
omg
Web
Windows Desktop
IE
US
213446
zxodksqpep
2014-06-30
20140630235636
NaN
MALE
32.0
basic
0
en
sem-brand
google
omg
Web
Mac Desktop
Safari
NDF
213447
mhewnxesx9
2014-06-30
20140630235719
NaN
-unknown-
NaN
basic
0
en
direct
direct
linked
Web
Windows Desktop
Chrome
NDF
213448
6o3arsjbb4
2014-06-30
20140630235754
NaN
-unknown-
32.0
basic
0
en
direct
direct
untracked
Web
Mac Desktop
Firefox
NDF
213449
jh95kwisub
2014-06-30
20140630235822
NaN
-unknown-
NaN
basic
25
en
other
other
tracked-other
iOS
iPhone
Mobile Safari
NDF
213450
nw9fwlyb5f
2014-06-30
20140630235824
NaN
-unknown-
NaN
basic
25
en
direct
direct
untracked
iOS
iPhone
-unknown-
NDF
213451 rows × 16 columns
In [35]:
a = df_usr.groupby(['signup_method','country_destination'])['signup_method'].count().unstack('country_destination').fillna(0)
a.plot(kind='bar',stacked=True,figsize=(20,10))
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x115f3ae80>
In [37]:
from datetime import datetime
def convert_time(row):
return datetime.strptime(row['date_account_created'], '%Y-%m-%d')
df_usr['date_account_created'] = df_usr.apply(convert_time, axis=1)
In [46]:
NonBookersPercentage = df_usr['date_first_booking'].isnull().sum()/len(df_usr)
print('Percentage of non bookers',NonBookersPercentage*100,'%')
Percentage of non bookers 58.3473490403 %
In [40]:
from datetime import datetime
df_usr.loc[df_usr['date_first_booking']=='NaN', 'TargetClass'] = 'Org: Politics'
def convert_time1(row):
return datetime.strptime(row['date_first_booking'], '%Y-%m-%d')
df_usr['date_first_booking'] = df_usr.apply(convert_time1, axis=1)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-40-e80cbbd9c417> in <module>()
4 return datetime.strptime(row['date_first_booking'], '%Y-%m-%d')
5
----> 6 df_usr['date_first_booking'] = df_usr.apply(convert_time1, axis=1)
/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4059 if reduce is None:
4060 reduce = True
-> 4061 return self._apply_standard(f, axis, reduce=reduce)
4062 else:
4063 return self._apply_broadcast(f, axis)
/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4155 try:
4156 for i, v in enumerate(series_gen):
-> 4157 results[i] = func(v)
4158 keys.append(v.name)
4159 except Exception as e:
<ipython-input-40-e80cbbd9c417> in convert_time1(row)
2
3 def convert_time1(row):
----> 4 return datetime.strptime(row['date_first_booking'], '%Y-%m-%d')
5
6 df_usr['date_first_booking'] = df_usr.apply(convert_time1, axis=1)
TypeError: ('strptime() argument 1 must be str, not float', 'occurred at index 0')
In [28]:
a = df_usr.groupby(['date_account_created','country_destination'])['date_account_created'].count().unstack('country_destination').fillna(0)
a.plot(kind='bar',stacked=True,figsize=(20,10))
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x11624dcf8>
In [32]:
import copy
df_2 = copy.deepcopy(df_usr)
#df_2.first_device_type = pd.get_dummies(df_2.first_device_type)
df_2.country_destination = pd.get_dummies(df_2.country_destination)
In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df_2.gender)
df_2.gender = le.transform(df_2.gender) # 0 unkn, 1 Male, 2 Female
In [14]:
df_2[:5]
Out[14]:
id
date_account_created
timestamp_first_active
date_first_booking
gender
age
signup_method
signup_flow
language
affiliate_channel
affiliate_provider
first_affiliate_tracked
signup_app
first_device_type
first_browser
country_destination
0
gxn3p5htnn
2010-06-28
20090319043255
NaN
0
NaN
facebook
0
en
direct
direct
untracked
Web
Mac Desktop
Chrome
0.0
1
820tgsjxq7
2011-05-25
20090523174809
NaN
2
38.0
facebook
0
en
seo
google
untracked
Web
Mac Desktop
Chrome
0.0
2
4ft3gnwmtx
2010-09-28
20090609231247
2010-08-02
1
56.0
basic
3
en
direct
direct
untracked
Web
Windows Desktop
IE
0.0
3
bjjt8pjhuk
2011-12-05
20091031060129
2012-09-08
1
42.0
facebook
0
en
direct
direct
untracked
Web
Mac Desktop
Firefox
0.0
4
87mebub9p4
2010-09-14
20091208061105
2010-02-18
0
41.0
basic
0
en
direct
direct
untracked
Web
Mac Desktop
Chrome
0.0
In [34]:
from datetime import datetime
def convert_time(row):
return datetime.strptime(row['date_account_created'], '%Y-%m-%d')
df_2['date_account_created'] = df_2.apply(convert_time, axis=1)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-d335c2505b3e> in <module>()
4 return datetime.strptime(row['date_account_created'], '%Y-%m-%d')
5
----> 6 df_2['date_account_created'] = df_2.apply(convert_time, axis=1)
7
/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4059 if reduce is None:
4060 reduce = True
-> 4061 return self._apply_standard(f, axis, reduce=reduce)
4062 else:
4063 return self._apply_broadcast(f, axis)
/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4155 try:
4156 for i, v in enumerate(series_gen):
-> 4157 results[i] = func(v)
4158 keys.append(v.name)
4159 except Exception as e:
<ipython-input-34-d335c2505b3e> in convert_time(row)
2
3 def convert_time(row):
----> 4 return datetime.strptime(row['date_account_created'], '%Y-%m-%d')
5
6 df_2['date_account_created'] = df_2.apply(convert_time, axis=1)
TypeError: ('strptime() argument 1 must be str, not Timestamp', 'occurred at index 0')
In [18]:
df_2.describe()
/Users/malogrisard/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
Out[18]:
timestamp_first_active
gender
age
signup_flow
country_destination
count
2.134510e+05
213451.000000
125461.000000
213451.000000
213451.000000
mean
2.013085e+13
0.809399
49.668335
3.267387
0.002525
std
9.253717e+09
0.819937
155.666612
7.637707
0.050188
min
2.009032e+13
0.000000
1.000000
0.000000
0.000000
25%
2.012123e+13
0.000000
NaN
0.000000
0.000000
50%
2.013091e+13
1.000000
NaN
0.000000
0.000000
75%
2.014031e+13
2.000000
NaN
0.000000
0.000000
max
2.014063e+13
3.000000
2014.000000
25.000000
1.000000
In [ ]:
In [19]:
scatter_matrix(df_2, alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.show()
In [21]:
from sklearn.decomposition import PCA
pca = PCA(n_components=§§§§)
pca.fit(df_2)
/Users/malogrisard/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
DeprecationWarning)
Out[21]:
PCA(copy=True, n_components=None, whiten=False)
In [25]:
import seaborn as sns
sns.set()
sns.pairplot(df_2, hue="gender")
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-ad321485df97> in <module>()
2 sns.set()
3
----> 4 sns.pairplot(df_2, hue="gender")
/Users/malogrisard/anaconda/lib/python3.5/site-packages/seaborn/linearmodels.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, size, aspect, dropna, plot_kws, diag_kws, grid_kws)
1607 if grid.square_grid:
1608 if diag_kind == "hist":
-> 1609 grid.map_diag(plt.hist, **diag_kws)
1610 elif diag_kind == "kde":
1611 diag_kws["legend"] = False
/Users/malogrisard/anaconda/lib/python3.5/site-packages/seaborn/axisgrid.py in map_diag(self, func, **kwargs)
1346 else:
1347 func(vals, color=self.palette, histtype="barstacked",
-> 1348 **kwargs)
1349 else:
1350 for k, label_k in enumerate(self.hue_names):
/Users/malogrisard/anaconda/lib/python3.5/site-packages/matplotlib/pyplot.py in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, data, **kwargs)
2963 histtype=histtype, align=align, orientation=orientation,
2964 rwidth=rwidth, log=log, color=color, label=label,
-> 2965 stacked=stacked, data=data, **kwargs)
2966 finally:
2967 ax.hold(washold)
/Users/malogrisard/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
1817 warnings.warn(msg % (label_namer, func.__name__),
1818 RuntimeWarning, stacklevel=2)
-> 1819 return func(ax, *args, **kwargs)
1820 pre_doc = inner.__doc__
1821 if pre_doc is None:
/Users/malogrisard/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
5983 # this will automatically overwrite bins,
5984 # so that each histogram uses the same bins
-> 5985 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
5986 m = m.astype(float) # causes problems later if it's an int
5987 if mlast is None:
/Users/malogrisard/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py in histogram(a, bins, range, normed, weights, density)
500 if mn > mx:
501 raise ValueError(
--> 502 'max must be larger than min in range parameter.')
503 if not np.all(np.isfinite([mn, mx])):
504 raise ValueError(
ValueError: max must be larger than min in range parameter.
In [ ]:
Content source: malogrisard/NTDScourse
Similar notebooks: