In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display
import os.path

folder = os.path.join('..', 'airbnb','countries.csv')

In [2]:
df = pd.read_csv(folder, sep=',')
display(df)


country_destination lat_destination lng_destination distance_km destination_km2 destination_language language_levenshtein_distance
0 AU -26.853388 133.275160 15297.7440 7741220.0 eng 0.00
1 CA 62.393303 -96.818146 2828.1333 9984670.0 eng 0.00
2 DE 51.165707 10.452764 7879.5680 357022.0 deu 72.61
3 ES 39.896027 -2.487694 7730.7240 505370.0 spa 92.25
4 FR 46.232193 2.209667 7682.9450 643801.0 fra 92.06
5 GB 54.633220 -3.432277 6883.6590 243610.0 eng 0.00
6 IT 41.873990 12.564167 8636.6310 301340.0 ita 89.40
7 NL 52.133057 5.295250 7524.3203 41543.0 nld 63.22
8 PT 39.553444 -7.839319 7355.2534 92090.0 por 95.45
9 US 36.966427 -95.844030 0.0000 9826675.0 eng 0.00

In [3]:
from pandas.tools.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.show()



In [4]:
folder = os.path.join('..', 'airbnb','train_users_2.csv')
df_usr = pd.read_csv(folder, sep=',')
display(df_usr)


id date_account_created timestamp_first_active date_first_booking gender age signup_method signup_flow language affiliate_channel affiliate_provider first_affiliate_tracked signup_app first_device_type first_browser country_destination
0 gxn3p5htnn 2010-06-28 20090319043255 NaN -unknown- NaN facebook 0 en direct direct untracked Web Mac Desktop Chrome NDF
1 820tgsjxq7 2011-05-25 20090523174809 NaN MALE 38.0 facebook 0 en seo google untracked Web Mac Desktop Chrome NDF
2 4ft3gnwmtx 2010-09-28 20090609231247 2010-08-02 FEMALE 56.0 basic 3 en direct direct untracked Web Windows Desktop IE US
3 bjjt8pjhuk 2011-12-05 20091031060129 2012-09-08 FEMALE 42.0 facebook 0 en direct direct untracked Web Mac Desktop Firefox other
4 87mebub9p4 2010-09-14 20091208061105 2010-02-18 -unknown- 41.0 basic 0 en direct direct untracked Web Mac Desktop Chrome US
5 osr2jwljor 2010-01-01 20100101215619 2010-01-02 -unknown- NaN basic 0 en other other omg Web Mac Desktop Chrome US
6 lsw9q7uk0j 2010-01-02 20100102012558 2010-01-05 FEMALE 46.0 basic 0 en other craigslist untracked Web Mac Desktop Safari US
7 0d01nltbrs 2010-01-03 20100103191905 2010-01-13 FEMALE 47.0 basic 0 en direct direct omg Web Mac Desktop Safari US
8 a1vcnhxeij 2010-01-04 20100104004211 2010-07-29 FEMALE 50.0 basic 0 en other craigslist untracked Web Mac Desktop Safari US
9 6uh8zyj2gn 2010-01-04 20100104023758 2010-01-04 -unknown- 46.0 basic 0 en other craigslist omg Web Mac Desktop Firefox US
10 yuuqmid2rp 2010-01-04 20100104194251 2010-01-06 FEMALE 36.0 basic 0 en other craigslist untracked Web Mac Desktop Firefox US
11 om1ss59ys8 2010-01-05 20100105051812 NaN FEMALE 47.0 basic 0 en other craigslist untracked Web iPhone -unknown- NDF
12 k6np330cm1 2010-01-05 20100105060859 2010-01-18 -unknown- NaN basic 0 en direct direct NaN Web Other/Unknown -unknown- FR
13 dy3rgx56cu 2010-01-05 20100105083259 NaN FEMALE 37.0 basic 0 en other craigslist linked Web Mac Desktop Firefox NDF
14 ju3h98ch3w 2010-01-07 20100107055820 NaN FEMALE 36.0 basic 0 en other craigslist untracked Web iPhone Mobile Safari NDF
15 v4d5rl22px 2010-01-07 20100107204555 2010-01-08 FEMALE 33.0 basic 0 en direct direct untracked Web Windows Desktop Chrome CA
16 2dwbwkx056 2010-01-07 20100107215125 NaN -unknown- NaN basic 0 en other craigslist NaN Web Other/Unknown -unknown- NDF
17 frhre329au 2010-01-07 20100107224625 2010-01-09 -unknown- 31.0 basic 0 en other craigslist NaN Web Other/Unknown -unknown- US
18 cxlg85pg1r 2010-01-08 20100108015641 NaN -unknown- NaN basic 0 en seo facebook NaN Web Other/Unknown -unknown- NDF
19 gdka1q5ktd 2010-01-10 20100110010817 2010-01-10 FEMALE 29.0 basic 0 en direct direct untracked Web Mac Desktop Chrome FR
20 qdubonn3uk 2010-01-10 20100110152120 2010-01-18 -unknown- NaN basic 0 en direct direct NaN Web Other/Unknown -unknown- US
21 qsibmuz9sx 2010-01-10 20100110220941 2010-01-11 MALE 30.0 basic 0 en direct direct linked Web Mac Desktop Chrome US
22 80f7dwscrn 2010-01-11 20100111031438 2010-01-11 -unknown- 40.0 basic 0 en seo google untracked Web iPhone -unknown- US
23 jha93x042q 2010-01-11 20100111224015 NaN -unknown- NaN basic 0 en other craigslist untracked Web Mac Desktop Safari NDF
24 7i49vnuav6 2010-01-11 20100111230808 NaN FEMALE 40.0 basic 0 en seo google untracked Web Mac Desktop Firefox NDF
25 al8bcetz0g 2010-01-12 20100112131444 2010-01-15 FEMALE 26.0 basic 0 en other craigslist untracked Web Mac Desktop Chrome FR
26 bjg0m5otl3 2010-01-12 20100112155420 NaN -unknown- NaN basic 0 en other other untracked Web Other/Unknown -unknown- NDF
27 hfrl5gle36 2010-01-12 20100112205949 2010-01-22 FEMALE 32.0 basic 0 en other craigslist untracked Web Desktop (Other) Chrome US
28 tp6x3md0n4 2010-01-13 20100113044650 2010-01-13 -unknown- 35.0 basic 0 en direct direct NaN Web Other/Unknown -unknown- FR
29 hql77nu2lk 2010-01-13 20100113064333 2010-01-19 -unknown- 37.0 basic 0 en direct direct untracked Web Android Tablet -unknown- US
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
213421 c98s3h7kgj 2014-06-30 20140630231137 NaN -unknown- NaN basic 0 en direct direct linked Web Mac Desktop Firefox NDF
213422 ytmpiwb8hj 2014-06-30 20140630231246 NaN -unknown- NaN basic 0 en direct direct untracked Web Windows Desktop IE NDF
213423 3dx1jk6yk2 2014-06-30 20140630231548 NaN FEMALE 20.0 facebook 25 en direct direct untracked iOS iPhone -unknown- NDF
213424 hcfj07iowv 2014-06-30 20140630231859 NaN FEMALE 32.0 facebook 0 en direct direct linked Web Windows Desktop Chrome NDF
213425 l1f71f9vsj 2014-06-30 20140630232119 NaN FEMALE 30.0 facebook 0 en direct direct linked Web Windows Desktop Chrome NDF
213426 15bj4ahmhf 2014-06-30 20140630232331 NaN -unknown- NaN basic 0 en direct direct untracked Moweb Android Phone Chrome Mobile NDF
213427 qwpybxfjdl 2014-06-30 20140630232539 NaN -unknown- NaN basic 0 en direct direct linked Web Desktop (Other) Chrome NDF
213428 k4t61wuvyq 2014-06-30 20140630232634 NaN -unknown- NaN basic 23 en direct direct untracked Android Android Phone -unknown- NDF
213429 mhh7b52z44 2014-06-30 20140630232712 NaN -unknown- NaN basic 25 en direct direct untracked iOS iPhone -unknown- NDF
213430 79wk7k2k5t 2014-06-30 20140630233132 NaN -unknown- 19.0 basic 0 en direct direct linked Web Mac Desktop Chrome NDF
213431 ftwmocvwlq 2014-06-30 20140630233203 NaN -unknown- NaN basic 0 en direct direct untracked Web Windows Desktop Firefox NDF
213432 rg7ayg1tob 2014-06-30 20140630233224 NaN MALE 31.0 facebook 0 en direct direct tracked-other Web Mac Desktop Safari NDF
213433 2f24umzkuv 2014-06-30 20140630233427 NaN -unknown- NaN basic 0 en sem-brand google untracked Web iPad Mobile Safari NDF
213434 or77n2ojuj 2014-06-30 20140630233640 NaN -unknown- NaN basic 0 en seo facebook product Web Mac Desktop Chrome NDF
213435 0a5bnb9bs4 2014-06-30 20140630233851 NaN -unknown- NaN basic 0 en seo google untracked Web Windows Desktop Chrome NDF
213436 6fzrn49sfn 2014-06-30 20140630234113 NaN -unknown- NaN basic 25 en direct direct untracked iOS iPhone -unknown- NDF
213437 r0jq0devgy 2014-06-30 20140630234243 NaN -unknown- NaN basic 23 en direct direct untracked Android Android Tablet -unknown- NDF
213438 v5lq9bj8gv 2014-06-30 20140630234429 NaN -unknown- NaN basic 25 en direct direct untracked iOS iPhone -unknown- NDF
213439 msucfwmlzc 2014-06-30 20140630234729 2015-03-16 MALE 43.0 basic 0 en direct direct untracked Web Windows Desktop Firefox US
213440 04y8115avm 2014-06-30 20140630234933 NaN FEMALE 24.0 basic 25 en direct direct untracked iOS iPhone Mobile Safari NDF
213441 omlc9iku7t 2014-06-30 20140630235151 2014-08-13 FEMALE 34.0 basic 0 en direct direct linked Web Mac Desktop Chrome ES
213442 rf0ay567js 2014-06-30 20140630235309 NaN -unknown- NaN basic 0 en sem-brand google omg Web Mac Desktop Chrome NDF
213443 0k26r3mir0 2014-06-30 20140630235340 2014-07-13 FEMALE 36.0 basic 0 en sem-brand google linked Web Mac Desktop Safari US
213444 40o1ivh6cb 2014-06-30 20140630235352 NaN -unknown- NaN basic 0 en direct direct linked Web Windows Desktop Chrome NDF
213445 qbxza0xojf 2014-06-30 20140630235547 2014-07-02 FEMALE 23.0 basic 0 en sem-brand google omg Web Windows Desktop IE US
213446 zxodksqpep 2014-06-30 20140630235636 NaN MALE 32.0 basic 0 en sem-brand google omg Web Mac Desktop Safari NDF
213447 mhewnxesx9 2014-06-30 20140630235719 NaN -unknown- NaN basic 0 en direct direct linked Web Windows Desktop Chrome NDF
213448 6o3arsjbb4 2014-06-30 20140630235754 NaN -unknown- 32.0 basic 0 en direct direct untracked Web Mac Desktop Firefox NDF
213449 jh95kwisub 2014-06-30 20140630235822 NaN -unknown- NaN basic 25 en other other tracked-other iOS iPhone Mobile Safari NDF
213450 nw9fwlyb5f 2014-06-30 20140630235824 NaN -unknown- NaN basic 25 en direct direct untracked iOS iPhone -unknown- NDF

213451 rows × 16 columns


In [35]:
a = df_usr.groupby(['signup_method','country_destination'])['signup_method'].count().unstack('country_destination').fillna(0)
a.plot(kind='bar',stacked=True,figsize=(20,10))


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x115f3ae80>

In [37]:
from datetime import datetime

def convert_time(row):
    return datetime.strptime(row['date_account_created'], '%Y-%m-%d')

df_usr['date_account_created'] = df_usr.apply(convert_time, axis=1)

In [46]:
NonBookersPercentage = df_usr['date_first_booking'].isnull().sum()/len(df_usr)
print('Percentage of non bookers',NonBookersPercentage*100,'%')


Percentage of non bookers 58.3473490403 %

In [40]:
from datetime import datetime

df_usr.loc[df_usr['date_first_booking']=='NaN', 'TargetClass'] = 'Org: Politics'

def convert_time1(row):
    return datetime.strptime(row['date_first_booking'], '%Y-%m-%d')

df_usr['date_first_booking'] = df_usr.apply(convert_time1, axis=1)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-40-e80cbbd9c417> in <module>()
      4     return datetime.strptime(row['date_first_booking'], '%Y-%m-%d')
      5 
----> 6 df_usr['date_first_booking'] = df_usr.apply(convert_time1, axis=1)

/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4059                     if reduce is None:
   4060                         reduce = True
-> 4061                     return self._apply_standard(f, axis, reduce=reduce)
   4062             else:
   4063                 return self._apply_broadcast(f, axis)

/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4155             try:
   4156                 for i, v in enumerate(series_gen):
-> 4157                     results[i] = func(v)
   4158                     keys.append(v.name)
   4159             except Exception as e:

<ipython-input-40-e80cbbd9c417> in convert_time1(row)
      2 
      3 def convert_time1(row):
----> 4     return datetime.strptime(row['date_first_booking'], '%Y-%m-%d')
      5 
      6 df_usr['date_first_booking'] = df_usr.apply(convert_time1, axis=1)

TypeError: ('strptime() argument 1 must be str, not float', 'occurred at index 0')

In [28]:
a = df_usr.groupby(['date_account_created','country_destination'])['date_account_created'].count().unstack('country_destination').fillna(0)
a.plot(kind='bar',stacked=True,figsize=(20,10))


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x11624dcf8>

In [32]:
import copy
df_2 = copy.deepcopy(df_usr)

#df_2.first_device_type = pd.get_dummies(df_2.first_device_type)
df_2.country_destination = pd.get_dummies(df_2.country_destination)

In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df_2.gender)
df_2.gender = le.transform(df_2.gender) # 0 unkn, 1 Male, 2 Female

In [14]:
df_2[:5]


Out[14]:
id date_account_created timestamp_first_active date_first_booking gender age signup_method signup_flow language affiliate_channel affiliate_provider first_affiliate_tracked signup_app first_device_type first_browser country_destination
0 gxn3p5htnn 2010-06-28 20090319043255 NaN 0 NaN facebook 0 en direct direct untracked Web Mac Desktop Chrome 0.0
1 820tgsjxq7 2011-05-25 20090523174809 NaN 2 38.0 facebook 0 en seo google untracked Web Mac Desktop Chrome 0.0
2 4ft3gnwmtx 2010-09-28 20090609231247 2010-08-02 1 56.0 basic 3 en direct direct untracked Web Windows Desktop IE 0.0
3 bjjt8pjhuk 2011-12-05 20091031060129 2012-09-08 1 42.0 facebook 0 en direct direct untracked Web Mac Desktop Firefox 0.0
4 87mebub9p4 2010-09-14 20091208061105 2010-02-18 0 41.0 basic 0 en direct direct untracked Web Mac Desktop Chrome 0.0

In [34]:
from datetime import datetime

def convert_time(row):
    return datetime.strptime(row['date_account_created'], '%Y-%m-%d')

df_2['date_account_created'] = df_2.apply(convert_time, axis=1)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-34-d335c2505b3e> in <module>()
      4     return datetime.strptime(row['date_account_created'], '%Y-%m-%d')
      5 
----> 6 df_2['date_account_created'] = df_2.apply(convert_time, axis=1)
      7 

/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4059                     if reduce is None:
   4060                         reduce = True
-> 4061                     return self._apply_standard(f, axis, reduce=reduce)
   4062             else:
   4063                 return self._apply_broadcast(f, axis)

/Users/malogrisard/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4155             try:
   4156                 for i, v in enumerate(series_gen):
-> 4157                     results[i] = func(v)
   4158                     keys.append(v.name)
   4159             except Exception as e:

<ipython-input-34-d335c2505b3e> in convert_time(row)
      2 
      3 def convert_time(row):
----> 4     return datetime.strptime(row['date_account_created'], '%Y-%m-%d')
      5 
      6 df_2['date_account_created'] = df_2.apply(convert_time, axis=1)

TypeError: ('strptime() argument 1 must be str, not Timestamp', 'occurred at index 0')

In [18]:
df_2.describe()


/Users/malogrisard/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[18]:
timestamp_first_active gender age signup_flow country_destination
count 2.134510e+05 213451.000000 125461.000000 213451.000000 213451.000000
mean 2.013085e+13 0.809399 49.668335 3.267387 0.002525
std 9.253717e+09 0.819937 155.666612 7.637707 0.050188
min 2.009032e+13 0.000000 1.000000 0.000000 0.000000
25% 2.012123e+13 0.000000 NaN 0.000000 0.000000
50% 2.013091e+13 1.000000 NaN 0.000000 0.000000
75% 2.014031e+13 2.000000 NaN 0.000000 0.000000
max 2.014063e+13 3.000000 2014.000000 25.000000 1.000000

In [ ]:


In [19]:
scatter_matrix(df_2, alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.show()



In [21]:
from sklearn.decomposition import PCA
pca = PCA(n_components=§§§§)
pca.fit(df_2)


/Users/malogrisard/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[21]:
PCA(copy=True, n_components=None, whiten=False)

In [25]:
import seaborn as sns
sns.set()

sns.pairplot(df_2, hue="gender")


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-ad321485df97> in <module>()
      2 sns.set()
      3 
----> 4 sns.pairplot(df_2, hue="gender")

/Users/malogrisard/anaconda/lib/python3.5/site-packages/seaborn/linearmodels.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, size, aspect, dropna, plot_kws, diag_kws, grid_kws)
   1607     if grid.square_grid:
   1608         if diag_kind == "hist":
-> 1609             grid.map_diag(plt.hist, **diag_kws)
   1610         elif diag_kind == "kde":
   1611             diag_kws["legend"] = False

/Users/malogrisard/anaconda/lib/python3.5/site-packages/seaborn/axisgrid.py in map_diag(self, func, **kwargs)
   1346                 else:
   1347                     func(vals, color=self.palette, histtype="barstacked",
-> 1348                          **kwargs)
   1349             else:
   1350                 for k, label_k in enumerate(self.hue_names):

/Users/malogrisard/anaconda/lib/python3.5/site-packages/matplotlib/pyplot.py in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, data, **kwargs)
   2963                       histtype=histtype, align=align, orientation=orientation,
   2964                       rwidth=rwidth, log=log, color=color, label=label,
-> 2965                       stacked=stacked, data=data, **kwargs)
   2966     finally:
   2967         ax.hold(washold)

/Users/malogrisard/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/Users/malogrisard/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
   5983             # this will automatically overwrite bins,
   5984             # so that each histogram uses the same bins
-> 5985             m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
   5986             m = m.astype(float)  # causes problems later if it's an int
   5987             if mlast is None:

/Users/malogrisard/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py in histogram(a, bins, range, normed, weights, density)
    500     if mn > mx:
    501         raise ValueError(
--> 502             'max must be larger than min in range parameter.')
    503     if not np.all(np.isfinite([mn, mx])):
    504         raise ValueError(

ValueError: max must be larger than min in range parameter.

In [ ]: