In [1]:

    
%matplotlib inline

Airbnb

Please download the Airbnb listings data from

http://data.insideairbnb.com/united-states/ny/new-york-city/2017-09-02/data/listings.csv.gz

and decompress it into the current folder as `listings.csv` (the file name the loading code below opens).



In [2]:

    
from __future__ import print_function
from __future__ import division

import logging
import csv

import numpy as np

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Load the listings and build three parallel lists, one entry per kept listing:
#   text    -- listing name and description joined into one string
#   outcome -- review score (NaN when the listing has no score)
#   meta    -- the listing's cleansed neighbourhood
# Listings whose stripped description is 10 characters or shorter are skipped.
text = []
outcome = []
meta = []
# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed as a single field. The encoding is pinned instead of
# relying on the platform default (assumes the export is UTF-8 -- TODO confirm).
with open("listings.csv", "r", newline="", encoding="utf-8") as f_in:
    for row in csv.DictReader(f_in):
        rating = row["review_scores_rating"]
        # float() accepts "95" as well as "95.0"; int() would raise on the latter.
        v = float(rating) if rating else np.nan
        if len(row["description"].strip()) > 10:
            text.append("{0} {1}".format(row["name"], row["description"]))
            outcome.append(v)
            meta.append(row["neighbourhood_cleansed"])

outcome = np.array(outcome, dtype=float)
nanmin = np.nanmin(outcome)
print("nanmin", nanmin, "nans", np.count_nonzero(np.isnan(outcome)))
# Listings without a score are imputed with the lowest observed score.
outcome[np.isnan(outcome)] = nanmin
# Binary target: True when the score is strictly above the median.
median = np.median(outcome)
label = outcome > median
# Summary statistics, displayed as the cell's output.
(
    ("min", outcome.min()),
    ("med", median),
    ("max", outcome.max()),
    ("mean", outcome.mean()),
    ("std", outcome.std()),
    ("count", outcome.shape[0]),
)









    



nanmin 20.0 nans 8841






    Out[2]:





(('min', 20.0),
 ('med', 93.0),
 ('max', 100.0),
 ('mean', 77.6177515080755),
 ('std', 31.000761076404899),
 ('count', 41112))



In [3]:

    
from pprint import pprint
from time import monotonic

from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the listing texts into a sparse TF-IDF matrix over the 3000 most
# frequent uni-/bi-/tri-grams, ignoring English stop words, terms that occur in
# fewer than 10 documents, and terms that occur in more than 90% of them.
cv = TfidfVectorizer(min_df=10, max_df=0.9, stop_words="english", ngram_range=(1, 3), max_features=3000)
t0 = monotonic()
data = cv.fit_transform(text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement when it exists and fall back for older versions.
# Either way `features` ends up as a plain list of strings.
if hasattr(cv, "get_feature_names_out"):
    features = list(cv.get_feature_names_out())
else:
    features = cv.get_feature_names()
print("done in %0.3fs" % (monotonic() - t0))

# Average number of non-zero features per listing, matrix shape, vocabulary size.
# count_nonzero() on the sparse matrix avoids materialising the dense array.
data.count_nonzero() / data.shape[0], data.shape, len(features)









    



done in 24.424s






    Out[3]:





(72.79623954076669, (41112, 3000), 3000)



In [4]:

    
# Show the TF-IDF matrix in dense form as a quick sanity check; numpy
# abbreviates the printed array, so only the corners are visible.
data.toarray()









    Out[4]:





array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.19942624,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])



In [5]:

    
# Peek at the first ten entries of the vectorizer's vocabulary.
features[:10]









    Out[5]:





['00',
 '10',
 '10 15',
 '10 min',
 '10 min walk',
 '10 mins',
 '10 minute',
 '10 minute walk',
 '10 minutes',
 '10 minutes away']



In [6]:

    
# Export the labelled data to "airbnb.csr", a CSV-based sparse row format:
#   header row : "label" followed by every feature name
#   data rows  : the 0/1 label followed by the (ascending) column indices of
#                the features whose TF-IDF weight is > 0 for that listing
# The first column is reserved for the label, so a feature with that name
# would be ambiguous.
if "label" in features:
    raise ValueError("label in features")
print("start")
real_features = [ "label" ] + list(features)
tr = 0.2  # NOTE(review): not used in this cell -- confirm whether it is still needed
# no meta :(
# Read the non-zero columns straight from the CSR structure instead of
# densifying the matrix and testing every cell in a Python loop.
csr = data.tocsr()
indptr, indices, values = csr.indptr, csr.indices, csr.data
# newline="" is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate "\n" emit an extra "\r" per row.
with open("airbnb.csr", "w", newline="") as f_out:
    out = csv.writer(f_out)
    out.writerow(real_features)
    for rix in range(csr.shape[0]):
        start, stop = indptr[rix], indptr[rix + 1]
        # Keep strictly positive entries only, and sort because CSR does not
        # guarantee that column indices within a row are ordered.
        positive = indices[start:stop][values[start:stop] > 0]
        cixs = np.sort(positive).tolist()
        out.writerow([ 1 if label[rix] else 0 ] + cixs)
        if rix % 5000 == 0:
            print(rix)
print("done")



In [7]:

    
import zipfile

# Pack the exported sparse file into an LZMA-compressed zip archive.
with zipfile.ZipFile("airbnb.zip", mode='w', compression=zipfile.ZIP_LZMA) as archive:
    archive.write("airbnb.csr")



In [ ]: