In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [60]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns # for pretty layout of plots
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
In [61]:
listings = pd.read_csv('../data/listings.csv', delimiter=",")
bookings = pd.read_csv('../data/bookings.csv', parse_dates=['booking_date'])
In [62]:
listings.head()
Out[62]:
prop_id
prop_type
neighborhood
price
person_capacity
picture_count
description_length
tenure_months
0
1
Property type 1
Neighborhood 14
140
3
11
232
30
1
2
Property type 1
Neighborhood 14
95
2
3
37
29
2
3
Property type 2
Neighborhood 16
95
2
16
172
29
3
4
Property type 2
Neighborhood 13
90
2
19
472
28
4
5
Property type 1
Neighborhood 15
125
5
21
442
28
In [63]:
bookings.head()
Out[63]:
prop_id
booking_date
0
9
2011-06-17
1
13
2011-08-12
2
21
2011-06-20
3
28
2011-05-05
4
29
2011-11-17
In [64]:
listings.describe().drop('prop_id',1)
Out[64]:
price
person_capacity
picture_count
description_length
tenure_months
count
408.000000
408.000000
408.000000
408.000000
408.000000
mean
187.806373
2.997549
14.389706
309.159314
8.487745
std
353.050858
1.594676
10.477428
228.021684
5.872088
min
39.000000
1.000000
1.000000
0.000000
1.000000
25%
90.000000
2.000000
6.000000
179.000000
4.000000
50%
125.000000
2.000000
12.000000
250.000000
7.000000
75%
199.000000
4.000000
20.000000
389.500000
13.000000
max
5000.000000
10.000000
71.000000
1969.000000
30.000000
In [65]:
#listings.ix[:,3:]
listings.groupby(['prop_type']).describe().ix[:,0:6].drop('prop_id', 1)
Out[65]:
description_length
person_capacity
picture_count
price
tenure_months
prop_type
Property type 1
count
269.000000
269.000000
269.000000
269.000000
269.000000
mean
313.171004
3.516729
14.695167
237.085502
8.464684
std
214.769141
1.644955
10.623651
425.710534
5.773367
min
17.000000
1.000000
1.000000
40.000000
1.000000
25%
193.000000
2.000000
6.000000
120.000000
4.000000
50%
266.000000
3.000000
12.000000
150.000000
7.000000
75%
388.000000
4.000000
20.000000
229.000000
14.000000
max
1719.000000
10.000000
71.000000
5000.000000
30.000000
Property type 2
count
135.000000
135.000000
135.000000
135.000000
135.000000
mean
304.851852
2.000000
13.948148
93.288889
8.377778
std
255.135332
0.846415
10.255191
42.261246
5.963654
min
0.000000
1.000000
1.000000
39.000000
1.000000
25%
150.500000
2.000000
6.500000
69.000000
4.000000
50%
239.000000
2.000000
11.000000
89.000000
7.000000
75%
402.500000
2.000000
19.500000
99.000000
10.500000
max
1969.000000
6.000000
56.000000
350.000000
29.000000
Property type 3
count
4.000000
4.000000
4.000000
4.000000
4.000000
mean
184.750000
1.750000
8.750000
63.750000
13.750000
std
53.093471
0.500000
7.320064
16.520190
8.616844
min
113.000000
1.000000
1.000000
40.000000
5.000000
25%
170.000000
1.750000
3.250000
58.750000
7.250000
50%
192.500000
2.000000
9.500000
70.000000
13.500000
75%
207.250000
2.000000
15.000000
75.000000
20.000000
max
241.000000
2.000000
15.000000
75.000000
23.000000
In [66]:
#data.groupby([‘col1’, ‘col2’])[‘col3’].mean()
listings.groupby(['neighborhood','prop_type']).describe().drop('prop_id',1)
Out[66]:
description_length
person_capacity
picture_count
price
tenure_months
neighborhood
prop_type
Neighborhood 1
Property type 1
count
1.000000
1.000000
1.000000
1.000000
1.000000
mean
209.000000
2.000000
26.000000
85.000000
6.000000
std
NaN
NaN
NaN
NaN
NaN
min
209.000000
2.000000
26.000000
85.000000
6.000000
25%
209.000000
2.000000
26.000000
85.000000
6.000000
50%
209.000000
2.000000
26.000000
85.000000
6.000000
75%
209.000000
2.000000
26.000000
85.000000
6.000000
max
209.000000
2.000000
26.000000
85.000000
6.000000
Neighborhood 10
Property type 1
count
6.000000
6.000000
6.000000
6.000000
6.000000
mean
391.000000
3.500000
13.333333
142.500000
3.833333
std
146.929915
1.224745
8.571270
36.979724
1.602082
min
160.000000
2.000000
4.000000
90.000000
1.000000
25%
312.250000
2.500000
6.750000
135.000000
3.250000
50%
425.500000
4.000000
12.000000
137.500000
4.500000
75%
499.000000
4.000000
20.250000
147.500000
5.000000
max
537.000000
5.000000
24.000000
205.000000
5.000000
Property type 2
count
2.000000
2.000000
2.000000
2.000000
2.000000
mean
126.000000
2.000000
20.000000
137.500000
3.500000
std
0.000000
0.000000
4.242641
17.677670
0.707107
min
126.000000
2.000000
17.000000
125.000000
3.000000
25%
126.000000
2.000000
18.500000
131.250000
3.250000
50%
126.000000
2.000000
20.000000
137.500000
3.500000
75%
126.000000
2.000000
21.500000
143.750000
3.750000
max
126.000000
2.000000
23.000000
150.000000
4.000000
Neighborhood 11
Property type 1
count
14.000000
14.000000
14.000000
14.000000
14.000000
mean
379.000000
3.214286
9.928571
159.428571
9.642857
std
396.956111
1.311404
5.928605
70.962302
5.212517
min
82.000000
2.000000
5.000000
95.000000
1.000000
25%
242.500000
2.000000
6.000000
103.750000
6.000000
50%
295.500000
3.000000
8.000000
130.000000
9.500000
...
...
...
...
...
...
...
...
Neighborhood 8
Property type 1
std
247.359388
1.154701
9.416298
50.301590
2.753785
min
72.000000
4.000000
5.000000
145.000000
4.000000
25%
121.500000
4.000000
5.750000
148.750000
4.750000
50%
256.000000
5.000000
7.000000
152.000000
6.500000
75%
434.500000
6.000000
12.250000
178.000000
8.500000
max
616.000000
6.000000
25.000000
250.000000
10.000000
Property type 2
count
1.000000
1.000000
1.000000
1.000000
1.000000
mean
223.000000
4.000000
5.000000
350.000000
3.000000
std
NaN
NaN
NaN
NaN
NaN
min
223.000000
4.000000
5.000000
350.000000
3.000000
25%
223.000000
4.000000
5.000000
350.000000
3.000000
50%
223.000000
4.000000
5.000000
350.000000
3.000000
75%
223.000000
4.000000
5.000000
350.000000
3.000000
max
223.000000
4.000000
5.000000
350.000000
3.000000
Neighborhood 9
Property type 1
count
7.000000
7.000000
7.000000
7.000000
7.000000
mean
471.428571
4.285714
13.428571
151.142857
5.714286
std
539.562124
2.690371
12.190941
137.914742
4.608481
min
153.000000
2.000000
4.000000
69.000000
1.000000
25%
204.000000
3.000000
6.000000
80.000000
2.500000
50%
247.000000
3.000000
10.000000
95.000000
3.000000
75%
421.500000
4.500000
14.500000
142.000000
9.000000
max
1649.000000
10.000000
39.000000
450.000000
13.000000
Property type 2
count
2.000000
2.000000
2.000000
2.000000
2.000000
mean
114.500000
2.000000
3.500000
110.000000
9.000000
std
137.885822
1.414214
0.707107
14.142136
2.828427
min
17.000000
1.000000
3.000000
100.000000
7.000000
25%
65.750000
1.500000
3.250000
105.000000
8.000000
50%
114.500000
2.000000
3.500000
110.000000
9.000000
75%
163.250000
2.500000
3.750000
115.000000
10.000000
max
212.000000
3.000000
4.000000
120.000000
11.000000
320 rows × 5 columns
In [67]:
bookings.groupby(['booking_date']).count().plot(linewidth=1)
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fefda90>
In [68]:
bl = bookings.merge(listings[['prop_id','neighborhood']], on = 'prop_id')
bl_count = bl.groupby(['neighborhood','booking_date']).count()
bl_count.unstack(0).plot(kind='area', legend=True, linewidth=1)
Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fbd29d0>
In [69]:
# Let me flatten the bl_count so that I can just index on 'neighborhood'
bl_count_flat = bl_count.reset_index()
bl_count_flat
Out[69]:
neighborhood
booking_date
prop_id
0
Neighborhood 1
2011-01-27
2
1
Neighborhood 1
2011-02-03
1
2
Neighborhood 1
2011-03-24
1
3
Neighborhood 1
2011-04-17
1
4
Neighborhood 1
2011-05-12
1
5
Neighborhood 1
2011-07-02
1
6
Neighborhood 1
2011-07-04
1
7
Neighborhood 1
2011-07-26
1
8
Neighborhood 1
2011-08-15
1
9
Neighborhood 1
2011-08-17
1
10
Neighborhood 1
2011-08-31
1
11
Neighborhood 1
2011-09-23
1
12
Neighborhood 1
2011-11-15
1
13
Neighborhood 1
2011-11-21
1
14
Neighborhood 1
2011-12-07
1
15
Neighborhood 10
2011-01-17
1
16
Neighborhood 10
2011-02-02
1
17
Neighborhood 10
2011-02-21
1
18
Neighborhood 10
2011-02-23
1
19
Neighborhood 10
2011-03-03
1
20
Neighborhood 10
2011-03-29
1
21
Neighborhood 10
2011-04-04
1
22
Neighborhood 10
2011-04-17
1
23
Neighborhood 10
2011-04-19
1
24
Neighborhood 10
2011-04-28
1
25
Neighborhood 10
2011-05-06
1
26
Neighborhood 10
2011-05-10
1
27
Neighborhood 10
2011-05-11
1
28
Neighborhood 10
2011-05-13
1
29
Neighborhood 10
2011-06-08
1
...
...
...
...
2752
Neighborhood 9
2011-08-29
1
2753
Neighborhood 9
2011-08-30
1
2754
Neighborhood 9
2011-09-01
1
2755
Neighborhood 9
2011-09-02
1
2756
Neighborhood 9
2011-09-05
1
2757
Neighborhood 9
2011-09-08
1
2758
Neighborhood 9
2011-09-13
3
2759
Neighborhood 9
2011-09-22
1
2760
Neighborhood 9
2011-09-26
2
2761
Neighborhood 9
2011-09-28
1
2762
Neighborhood 9
2011-09-30
2
2763
Neighborhood 9
2011-10-01
1
2764
Neighborhood 9
2011-10-04
1
2765
Neighborhood 9
2011-10-09
2
2766
Neighborhood 9
2011-10-16
1
2767
Neighborhood 9
2011-10-31
3
2768
Neighborhood 9
2011-11-05
1
2769
Neighborhood 9
2011-11-06
1
2770
Neighborhood 9
2011-11-14
1
2771
Neighborhood 9
2011-11-15
1
2772
Neighborhood 9
2011-11-18
1
2773
Neighborhood 9
2011-11-21
1
2774
Neighborhood 9
2011-11-29
1
2775
Neighborhood 9
2011-11-30
1
2776
Neighborhood 9
2011-12-01
1
2777
Neighborhood 9
2011-12-02
1
2778
Neighborhood 9
2011-12-05
1
2779
Neighborhood 9
2011-12-13
2
2780
Neighborhood 9
2011-12-26
1
2781
Neighborhood 9
2011-12-29
1
2782 rows × 3 columns
In [70]:
#create a figure with 7 rows and 3 Columns with Fig size of 20 (on x axis) and 30 on y axis.
fig, ax = plt.subplots(7, 3, figsize=(20, 30))
# keep track of the plt_count
plt_count = 0
#group on just the Neighborhood. Then plot each Neighborhood on a separate plot
bl_count_group_nhood = bl_count_flat.groupby('neighborhood')
bl_count_group_nhood
for key, val in bl_count_group_nhood:
row = plt_count / 3
col = plt_count - (row * 3)
ax[row, col].plot(val.booking_date, val.prop_id)
ax[row, col].set_title(key)
plt_count += 1
In [71]:
listings.tail()
Out[71]:
prop_id
prop_type
neighborhood
price
person_capacity
picture_count
description_length
tenure_months
403
404
Property type 2
Neighborhood 14
100
1
8
235
1
404
405
Property type 2
Neighborhood 13
85
2
27
1048
1
405
406
Property type 1
Neighborhood 9
70
3
18
153
1
406
407
Property type 1
Neighborhood 13
129
2
13
370
1
407
408
Property type 1
Neighborhood 14
100
3
21
707
1
In [72]:
listings2 = listings.merge(bookings.groupby(['prop_id']).count().reset_index(), on= 'prop_id')
listings2.rename(columns={'booking_date': 'number_of_bookings'}, inplace=True)
listings2['booking_rate']= 0
listings2['tenure_months']=listings2['tenure_months'].astype('float')
def book_rte(x):
return x['number_of_bookings']/x['tenure_months']
listings2['booking_rate']=listings2.apply(book_rte, axis=1)
listings2
Out[72]:
prop_id
prop_type
neighborhood
price
person_capacity
picture_count
description_length
tenure_months
number_of_bookings
booking_rate
0
1
Property type 1
Neighborhood 14
140
3
11
232
30
4
0.133333
1
3
Property type 2
Neighborhood 16
95
2
16
172
29
1
0.034483
2
4
Property type 2
Neighborhood 13
90
2
19
472
28
27
0.964286
3
6
Property type 2
Neighborhood 13
89
2
10
886
28
88
3.142857
4
7
Property type 2
Neighborhood 13
85
1
11
58
24
2
0.083333
5
8
Property type 1
Neighborhood 18
120
2
13
685
23
28
1.217391
6
9
Property type 1
Neighborhood 13
210
6
27
180
23
3
0.130435
7
10
Property type 3
Neighborhood 17
65
2
15
189
23
26
1.130435
8
11
Property type 1
Neighborhood 13
145
3
9
140
22
7
0.318182
9
12
Property type 2
Neighborhood 13
89
2
11
153
22
45
2.045455
10
13
Property type 2
Neighborhood 14
96
2
10
245
20
57
2.850000
11
14
Property type 2
Neighborhood 17
95
2
8
139
20
31
1.550000
12
15
Property type 2
Neighborhood 11
95
2
15
255
19
12
0.631579
13
16
Property type 2
Neighborhood 14
95
2
39
334
19
44
2.315789
14
17
Property type 2
Neighborhood 12
65
2
7
333
19
7
0.368421
15
19
Property type 2
Neighborhood 12
65
2
9
448
19
1
0.052632
16
20
Property type 3
Neighborhood 4
40
2
4
241
19
9
0.473684
17
21
Property type 1
Neighborhood 13
295
5
22
228
18
9
0.500000
18
23
Property type 2
Neighborhood 14
90
3
15
411
18
52
2.888889
19
24
Property type 1
Neighborhood 16
110
2
10
495
17
26
1.529412
20
25
Property type 1
Neighborhood 18
215
2
16
190
17
3
0.176471
21
26
Property type 1
Neighborhood 14
139
2
20
395
17
2
0.117647
22
27
Property type 1
Neighborhood 15
180
6
26
325
17
68
4.000000
23
28
Property type 2
Neighborhood 12
95
2
8
137
16
8
0.500000
24
29
Property type 1
Neighborhood 17
125
3
31
327
16
2
0.125000
25
30
Property type 2
Neighborhood 17
90
3
5
582
16
9
0.562500
26
33
Property type 1
Neighborhood 13
246
8
10
437
16
1
0.062500
27
35
Property type 1
Neighborhood 13
170
4
39
404
16
1
0.062500
28
43
Property type 1
Neighborhood 12
229
4
6
281
16
3
0.187500
29
44
Property type 1
Neighborhood 19
500
4
6
342
16
2
0.125000
...
...
...
...
...
...
...
...
...
...
...
298
375
Property type 1
Neighborhood 15
114
2
12
213
2
2
1.000000
299
376
Property type 2
Neighborhood 16
99
2
10
0
2
1
0.500000
300
377
Property type 2
Neighborhood 13
90
2
44
123
2
10
5.000000
301
378
Property type 1
Neighborhood 15
90
3
3
288
2
14
7.000000
302
379
Property type 1
Neighborhood 18
79
2
5
348
2
3
1.500000
303
380
Property type 1
Neighborhood 14
145
4
9
253
2
33
16.500000
304
381
Property type 1
Neighborhood 16
159
3
17
206
2
23
11.500000
305
382
Property type 1
Neighborhood 19
120
4
19
203
2
4
2.000000
306
384
Property type 1
Neighborhood 14
425
6
13
449
2
4
2.000000
307
385
Property type 1
Neighborhood 13
110
4
22
464
1
9
9.000000
308
386
Property type 1
Neighborhood 14
97
2
5
195
1
52
52.000000
309
387
Property type 1
Neighborhood 14
125
2
6
425
1
43
43.000000
310
389
Property type 2
Neighborhood 17
39
1
21
247
1
8
8.000000
311
390
Property type 1
Neighborhood 12
150
2
9
141
1
10
10.000000
312
391
Property type 1
Neighborhood 14
95
2
8
217
1
33
33.000000
313
392
Property type 2
Neighborhood 17
105
2
26
224
1
18
18.000000
314
393
Property type 1
Neighborhood 19
319
6
8
213
1
16
16.000000
315
394
Property type 2
Neighborhood 15
190
4
12
179
1
6
6.000000
316
395
Property type 1
Neighborhood 15
120
2
5
192
1
12
12.000000
317
397
Property type 1
Neighborhood 13
139
4
13
303
1
24
24.000000
318
398
Property type 1
Neighborhood 12
60
2
10
102
1
1
1.000000
319
400
Property type 1
Neighborhood 10
90
2
23
523
1
9
9.000000
320
401
Property type 1
Neighborhood 11
150
4
8
82
1
2
2.000000
321
402
Property type 1
Neighborhood 7
300
4
19
550
1
5
5.000000
322
403
Property type 1
Neighborhood 19
100
4
2
69
1
1
1.000000
323
404
Property type 2
Neighborhood 14
100
1
8
235
1
3
3.000000
324
405
Property type 2
Neighborhood 13
85
2
27
1048
1
19
19.000000
325
406
Property type 1
Neighborhood 9
70
3
18
153
1
19
19.000000
326
407
Property type 1
Neighborhood 13
129
2
13
370
1
15
15.000000
327
408
Property type 1
Neighborhood 14
100
3
21
707
1
54
54.000000
328 rows × 10 columns
In [73]:
listings3 = listings2[listings2.tenure_months >= 10]
listings3
Out[73]:
prop_id
prop_type
neighborhood
price
person_capacity
picture_count
description_length
tenure_months
number_of_bookings
booking_rate
0
1
Property type 1
Neighborhood 14
140
3
11
232
30
4
0.133333
1
3
Property type 2
Neighborhood 16
95
2
16
172
29
1
0.034483
2
4
Property type 2
Neighborhood 13
90
2
19
472
28
27
0.964286
3
6
Property type 2
Neighborhood 13
89
2
10
886
28
88
3.142857
4
7
Property type 2
Neighborhood 13
85
1
11
58
24
2
0.083333
5
8
Property type 1
Neighborhood 18
120
2
13
685
23
28
1.217391
6
9
Property type 1
Neighborhood 13
210
6
27
180
23
3
0.130435
7
10
Property type 3
Neighborhood 17
65
2
15
189
23
26
1.130435
8
11
Property type 1
Neighborhood 13
145
3
9
140
22
7
0.318182
9
12
Property type 2
Neighborhood 13
89
2
11
153
22
45
2.045455
10
13
Property type 2
Neighborhood 14
96
2
10
245
20
57
2.850000
11
14
Property type 2
Neighborhood 17
95
2
8
139
20
31
1.550000
12
15
Property type 2
Neighborhood 11
95
2
15
255
19
12
0.631579
13
16
Property type 2
Neighborhood 14
95
2
39
334
19
44
2.315789
14
17
Property type 2
Neighborhood 12
65
2
7
333
19
7
0.368421
15
19
Property type 2
Neighborhood 12
65
2
9
448
19
1
0.052632
16
20
Property type 3
Neighborhood 4
40
2
4
241
19
9
0.473684
17
21
Property type 1
Neighborhood 13
295
5
22
228
18
9
0.500000
18
23
Property type 2
Neighborhood 14
90
3
15
411
18
52
2.888889
19
24
Property type 1
Neighborhood 16
110
2
10
495
17
26
1.529412
20
25
Property type 1
Neighborhood 18
215
2
16
190
17
3
0.176471
21
26
Property type 1
Neighborhood 14
139
2
20
395
17
2
0.117647
22
27
Property type 1
Neighborhood 15
180
6
26
325
17
68
4.000000
23
28
Property type 2
Neighborhood 12
95
2
8
137
16
8
0.500000
24
29
Property type 1
Neighborhood 17
125
3
31
327
16
2
0.125000
25
30
Property type 2
Neighborhood 17
90
3
5
582
16
9
0.562500
26
33
Property type 1
Neighborhood 13
246
8
10
437
16
1
0.062500
27
35
Property type 1
Neighborhood 13
170
4
39
404
16
1
0.062500
28
43
Property type 1
Neighborhood 12
229
4
6
281
16
3
0.187500
29
44
Property type 1
Neighborhood 19
500
4
6
342
16
2
0.125000
...
...
...
...
...
...
...
...
...
...
...
83
110
Property type 1
Neighborhood 20
350
3
6
135
12
2
0.166667
84
111
Property type 2
Neighborhood 14
85
2
33
248
12
3
0.250000
85
112
Property type 1
Neighborhood 14
129
2
39
759
12
37
3.083333
86
113
Property type 2
Neighborhood 13
50
2
5
514
12
24
2.000000
87
114
Property type 1
Neighborhood 13
325
4
46
227
11
3
0.272727
88
115
Property type 1
Neighborhood 13
180
3
18
256
11
8
0.727273
89
117
Property type 2
Neighborhood 14
49
2
14
417
11
8
0.727273
90
118
Property type 2
Neighborhood 4
60
2
10
95
11
11
1.000000
91
119
Property type 2
Neighborhood 12
55
2
8
333
11
1
0.090909
92
121
Property type 2
Neighborhood 15
69
3
15
418
10
33
3.300000
93
122
Property type 1
Neighborhood 18
125
2
24
756
10
38
3.800000
94
123
Property type 2
Neighborhood 13
63
1
6
575
10
32
3.200000
95
124
Property type 1
Neighborhood 12
135
2
19
538
10
36
3.600000
96
125
Property type 1
Neighborhood 14
140
2
12
196
10
4
0.400000
97
126
Property type 1
Neighborhood 12
245
5
17
282
10
30
3.000000
98
127
Property type 2
Neighborhood 19
85
2
16
192
10
41
4.100000
99
128
Property type 1
Neighborhood 17
120
2
15
352
10
34
3.400000
100
129
Property type 1
Neighborhood 13
400
8
18
321
10
19
1.900000
101
130
Property type 2
Neighborhood 18
95
6
17
334
10
67
6.700000
102
132
Property type 1
Neighborhood 9
69
2
4
211
10
20
2.000000
103
133
Property type 1
Neighborhood 14
110
3
29
437
10
22
2.200000
104
135
Property type 1
Neighborhood 11
100
3
23
123
10
37
3.700000
105
136
Property type 2
Neighborhood 16
99
2
20
143
10
46
4.600000
106
137
Property type 1
Neighborhood 8
250
6
8
138
10
1
0.100000
107
138
Property type 1
Neighborhood 11
120
2
8
193
10
37
3.700000
108
139
Property type 2
Neighborhood 19
85
3
35
852
10
39
3.900000
109
140
Property type 1
Neighborhood 12
200
4
18
125
10
10
1.000000
110
141
Property type 2
Neighborhood 12
45
2
36
281
10
1
0.100000
111
142
Property type 2
Neighborhood 15
96
2
9
138
10
48
4.800000
112
143
Property type 2
Neighborhood 15
58
2
7
135
10
21
2.100000
113 rows × 10 columns
prop_type and neighborhood are categorical variables, use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform this column of categorical data to many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type and 1 column for every neighborhood category.
In [74]:
listings4 = pd.get_dummies(listings3)
listings4
Out[74]:
prop_id
price
person_capacity
picture_count
description_length
tenure_months
number_of_bookings
booking_rate
prop_type_Property type 1
prop_type_Property type 2
...
neighborhood_Neighborhood 15
neighborhood_Neighborhood 16
neighborhood_Neighborhood 17
neighborhood_Neighborhood 18
neighborhood_Neighborhood 19
neighborhood_Neighborhood 20
neighborhood_Neighborhood 21
neighborhood_Neighborhood 4
neighborhood_Neighborhood 8
neighborhood_Neighborhood 9
0
1
140
3
11
232
30
4
0.133333
1
0
...
0
0
0
0
0
0
0
0
0
0
1
3
95
2
16
172
29
1
0.034483
0
1
...
0
1
0
0
0
0
0
0
0
0
2
4
90
2
19
472
28
27
0.964286
0
1
...
0
0
0
0
0
0
0
0
0
0
3
6
89
2
10
886
28
88
3.142857
0
1
...
0
0
0
0
0
0
0
0
0
0
4
7
85
1
11
58
24
2
0.083333
0
1
...
0
0
0
0
0
0
0
0
0
0
5
8
120
2
13
685
23
28
1.217391
1
0
...
0
0
0
1
0
0
0
0
0
0
6
9
210
6
27
180
23
3
0.130435
1
0
...
0
0
0
0
0
0
0
0
0
0
7
10
65
2
15
189
23
26
1.130435
0
0
...
0
0
1
0
0
0
0
0
0
0
8
11
145
3
9
140
22
7
0.318182
1
0
...
0
0
0
0
0
0
0
0
0
0
9
12
89
2
11
153
22
45
2.045455
0
1
...
0
0
0
0
0
0
0
0
0
0
10
13
96
2
10
245
20
57
2.850000
0
1
...
0
0
0
0
0
0
0
0
0
0
11
14
95
2
8
139
20
31
1.550000
0
1
...
0
0
1
0
0
0
0
0
0
0
12
15
95
2
15
255
19
12
0.631579
0
1
...
0
0
0
0
0
0
0
0
0
0
13
16
95
2
39
334
19
44
2.315789
0
1
...
0
0
0
0
0
0
0
0
0
0
14
17
65
2
7
333
19
7
0.368421
0
1
...
0
0
0
0
0
0
0
0
0
0
15
19
65
2
9
448
19
1
0.052632
0
1
...
0
0
0
0
0
0
0
0
0
0
16
20
40
2
4
241
19
9
0.473684
0
0
...
0
0
0
0
0
0
0
1
0
0
17
21
295
5
22
228
18
9
0.500000
1
0
...
0
0
0
0
0
0
0
0
0
0
18
23
90
3
15
411
18
52
2.888889
0
1
...
0
0
0
0
0
0
0
0
0
0
19
24
110
2
10
495
17
26
1.529412
1
0
...
0
1
0
0
0
0
0
0
0
0
20
25
215
2
16
190
17
3
0.176471
1
0
...
0
0
0
1
0
0
0
0
0
0
21
26
139
2
20
395
17
2
0.117647
1
0
...
0
0
0
0
0
0
0
0
0
0
22
27
180
6
26
325
17
68
4.000000
1
0
...
1
0
0
0
0
0
0
0
0
0
23
28
95
2
8
137
16
8
0.500000
0
1
...
0
0
0
0
0
0
0
0
0
0
24
29
125
3
31
327
16
2
0.125000
1
0
...
0
0
1
0
0
0
0
0
0
0
25
30
90
3
5
582
16
9
0.562500
0
1
...
0
0
1
0
0
0
0
0
0
0
26
33
246
8
10
437
16
1
0.062500
1
0
...
0
0
0
0
0
0
0
0
0
0
27
35
170
4
39
404
16
1
0.062500
1
0
...
0
0
0
0
0
0
0
0
0
0
28
43
229
4
6
281
16
3
0.187500
1
0
...
0
0
0
0
0
0
0
0
0
0
29
44
500
4
6
342
16
2
0.125000
1
0
...
0
0
0
0
1
0
0
0
0
0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
83
110
350
3
6
135
12
2
0.166667
1
0
...
0
0
0
0
0
1
0
0
0
0
84
111
85
2
33
248
12
3
0.250000
0
1
...
0
0
0
0
0
0
0
0
0
0
85
112
129
2
39
759
12
37
3.083333
1
0
...
0
0
0
0
0
0
0
0
0
0
86
113
50
2
5
514
12
24
2.000000
0
1
...
0
0
0
0
0
0
0
0
0
0
87
114
325
4
46
227
11
3
0.272727
1
0
...
0
0
0
0
0
0
0
0
0
0
88
115
180
3
18
256
11
8
0.727273
1
0
...
0
0
0
0
0
0
0
0
0
0
89
117
49
2
14
417
11
8
0.727273
0
1
...
0
0
0
0
0
0
0
0
0
0
90
118
60
2
10
95
11
11
1.000000
0
1
...
0
0
0
0
0
0
0
1
0
0
91
119
55
2
8
333
11
1
0.090909
0
1
...
0
0
0
0
0
0
0
0
0
0
92
121
69
3
15
418
10
33
3.300000
0
1
...
1
0
0
0
0
0
0
0
0
0
93
122
125
2
24
756
10
38
3.800000
1
0
...
0
0
0
1
0
0
0
0
0
0
94
123
63
1
6
575
10
32
3.200000
0
1
...
0
0
0
0
0
0
0
0
0
0
95
124
135
2
19
538
10
36
3.600000
1
0
...
0
0
0
0
0
0
0
0
0
0
96
125
140
2
12
196
10
4
0.400000
1
0
...
0
0
0
0
0
0
0
0
0
0
97
126
245
5
17
282
10
30
3.000000
1
0
...
0
0
0
0
0
0
0
0
0
0
98
127
85
2
16
192
10
41
4.100000
0
1
...
0
0
0
0
1
0
0
0
0
0
99
128
120
2
15
352
10
34
3.400000
1
0
...
0
0
1
0
0
0
0
0
0
0
100
129
400
8
18
321
10
19
1.900000
1
0
...
0
0
0
0
0
0
0
0
0
0
101
130
95
6
17
334
10
67
6.700000
0
1
...
0
0
0
1
0
0
0
0
0
0
102
132
69
2
4
211
10
20
2.000000
1
0
...
0
0
0
0
0
0
0
0
0
1
103
133
110
3
29
437
10
22
2.200000
1
0
...
0
0
0
0
0
0
0
0
0
0
104
135
100
3
23
123
10
37
3.700000
1
0
...
0
0
0
0
0
0
0
0
0
0
105
136
99
2
20
143
10
46
4.600000
0
1
...
0
1
0
0
0
0
0
0
0
0
106
137
250
6
8
138
10
1
0.100000
1
0
...
0
0
0
0
0
0
0
0
1
0
107
138
120
2
8
193
10
37
3.700000
1
0
...
0
0
0
0
0
0
0
0
0
0
108
139
85
3
35
852
10
39
3.900000
0
1
...
0
0
0
0
1
0
0
0
0
0
109
140
200
4
18
125
10
10
1.000000
1
0
...
0
0
0
0
0
0
0
0
0
0
110
141
45
2
36
281
10
1
0.100000
0
1
...
0
0
0
0
0
0
0
0
0
0
111
142
96
2
9
138
10
48
4.800000
0
1
...
1
0
0
0
0
0
0
0
0
0
112
143
58
2
7
135
10
21
2.100000
0
1
...
1
0
0
0
0
0
0
0
0
0
113 rows × 25 columns
In [75]:
X = listings4.drop(['prop_id','booking_rate','number_of_bookings'], axis=1)
y = listings4['booking_rate']
predictor (y) is booking_rate, regressors (X) are everything else, except prop_id,booking_rate,prop_type,neighborhood and number_of_bookings
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [76]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 12)
In [77]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [78]:
lr.fit(X_train,y_train)
Out[78]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
In [91]:
lr.score(X_test,y_test)
Out[91]:
0.41248958759380083
In [89]:
X_train.shape
Out[89]:
(90, 22)
In [80]:
listings5 = listings4
listings5['monthly_revenue'] = 0
def monthly_rev(x):
return x['number_of_bookings']*x['price']/x['tenure_months']
listings5['monthly_revenue']=listings5.apply(monthly_rev, axis=1)
listings5
Out[80]:
(90, 22)
In [ ]:
Content source: ga-students/DAT_SF_11_homework
Similar notebooks: