In [1]:
%matplotlib inline

Airbnb

Please download the New York City Airbnb listings data from

http://data.insideairbnb.com/united-states/ny/new-york-city/2017-09-02/data/listings.csv.gz

and unzip it into the current folder so that listings.csv sits next to this notebook.
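
If you prefer to fetch and decompress the data from within the notebook itself, here is a minimal sketch (assuming Python 3; the URL and the listings.csv output name match the cells below, but this download step is not part of the original workflow):

In [ ]:
import gzip
import shutil
import urllib.request

URL = ("http://data.insideairbnb.com/united-states/ny/new-york-city/"
       "2017-09-02/data/listings.csv.gz")

# Download the gzipped listings file, then decompress it to listings.csv.
urllib.request.urlretrieve(URL, "listings.csv.gz")
with gzip.open("listings.csv.gz", "rb") as f_in, open("listings.csv", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)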


In [2]:
from __future__ import print_function
from __future__ import division

import logging
import csv

import numpy as np

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Collect listing text, review score and neighbourhood for every listing
# that has a non-trivial description.
text = []
outcome = []
meta = []
with open("listings.csv", "r") as f_in:
    for row in csv.DictReader(f_in):
        # Missing review scores become NaN for now.
        v = int(row["review_scores_rating"]) if row["review_scores_rating"] else np.nan
        if len(row["description"].strip()) > 10:
            text.append("{0} {1}".format(row["name"], row["description"]))
            outcome.append(v)
            meta.append(row["neighbourhood_cleansed"])

outcome = np.array(outcome)
# Impute missing scores with the lowest observed score.
nanmin = np.nanmin(outcome)
print("nanmin", nanmin, "nans", np.count_nonzero(np.isnan(outcome)))
outcome[np.isnan(outcome)] = nanmin
# Binary target: is the listing rated strictly above the median?
label = outcome > np.median(outcome)
(("min", outcome.min()),
 ("med", np.median(outcome)),
 ("max", outcome.max()),
 ("mean", outcome.mean()),
 ("std", outcome.std()),
 ("count", outcome.shape[0]))


nanmin 20.0 nans 8841
Out[2]:
(('min', 20.0),
 ('med', 93.0),
 ('max', 100.0),
 ('mean', 77.6177515080755),
 ('std', 31.000761076404899),
 ('count', 41112))
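
Because label uses a strict greater-than comparison, listings rated exactly at the median (93) count as negative, so the split is not necessarily 50/50. A quick check on the label array from the cell above, as a small sketch:

In [ ]:
# Fraction and count of listings rated strictly above the median.
print("positive fraction: %.3f" % label.mean())
print("positive count:", int(label.sum()), "of", label.shape[0])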

In [3]:
from pprint import pprint
from time import monotonic

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams to trigrams, keeping the 3000 most frequent terms
# that occur in at least 10 listings and in at most 90% of them.
cv = TfidfVectorizer(min_df=10, max_df=0.9, stop_words="english", ngram_range=(1, 3), max_features=3000)
t0 = monotonic()
data = cv.fit_transform(text)
features = cv.get_feature_names()
print("done in %0.3fs" % (monotonic() - t0))

# Average number of non-zero features per listing, matrix shape, vocabulary size.
np.count_nonzero(data.toarray()) / data.shape[0], data.shape, len(features)


done in 24.424s
Out[3]:
(72.79623954076669, (41112, 3000), 3000)
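
As a quick sanity check on the vectorizer, one can inspect the highest-weighted terms for a single listing. A minimal sketch reusing data and features from above (listing index 0 is an arbitrary choice):

In [ ]:
# Top TF-IDF terms for the first listing.
row = data[0].toarray().ravel()
top = np.argsort(row)[::-1][:10]
for ix in top:
    if row[ix] > 0:
        print("%-25s %.3f" % (features[ix], row[ix]))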

In [4]:
data.toarray()


Out[4]:
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.19942624,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [5]:
features[:10]


Out[5]:
['00',
 '10',
 '10 15',
 '10 min',
 '10 min walk',
 '10 mins',
 '10 minute',
 '10 minute walk',
 '10 minutes',
 '10 minutes away']

In [6]:
# Guard against a vocabulary term colliding with the label column name.
if "label" in features:
    raise ValueError("label in features")
print("start")
real_features = [ "label" ] + features
tr = 0.2
# no meta :(
mat = data.toarray()
# Write a sparse-index file: the header row lists "label" plus all feature
# names; each data row holds the 0/1 label followed by the column indices
# of that listing's non-zero TF-IDF entries.
with open("airbnb.csr", "w") as f_out:
    out = csv.writer(f_out)
    out.writerow(real_features)
    for rix in range(mat.shape[0]):
        cixs = []
        for cix in range(mat.shape[1]):
            if mat[rix, cix] > 0:
                cixs.append(cix)
        out.writerow([ 1 if label[rix] else 0 ] + cixs)
        if rix % 5000 == 0:
            print(rix)
print("done")


start
0
5000
10000
15000
20000
25000
30000
35000
40000
done
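
Despite the .csr extension, the file written above is not a standard CSR dump: the header row holds "label" plus the 3000 feature names, and each data row holds the 0/1 label followed by the column indices of that listing's non-zero TF-IDF entries (the weights themselves are discarded). Here is a sketch of how such a file could be read back into a binary indicator matrix, under exactly those assumptions:

In [ ]:
import csv
import numpy as np

with open("airbnb.csr", "r") as f_in:
    reader = csv.reader(f_in)
    header = next(reader)              # ["label", feature 1, ..., feature 3000]
    n_features = len(header) - 1
    labels, rows = [], []
    for row in reader:
        labels.append(int(row[0]))
        indicator = np.zeros(n_features, dtype=np.int8)
        cols = [int(c) for c in row[1:]]
        if cols:
            indicator[cols] = 1        # mark the columns that had non-zero TF-IDF
        rows.append(indicator)

labels = np.array(labels)
indicators = np.vstack(rows)
labels.shape, indicators.shape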

In [7]:
import zipfile

# Pack the sparse-index file into an LZMA-compressed zip archive.
with zipfile.ZipFile("airbnb.zip", 'w', zipfile.ZIP_LZMA) as zf:
    zf.write("airbnb.csr")

In [ ]: