In [1]:
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in the cells below — confirm this is still needed.
%matplotlib inline
Please download the Airbnb listings data from
http://data.insideairbnb.com/united-states/ny/new-york-city/2017-09-02/data/listings.csv.gz
and decompress it (e.g. `gunzip listings.csv.gz`) so that `listings.csv` is in the current folder.
In [2]:
from __future__ import print_function
from __future__ import division
import logging
import csv
import numpy as np
# Configure the root logger: INFO level, each record shown as
# "<timestamp> <level name> <message>".
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
# Read listings.csv into three parallel lists (one entry per kept listing):
#   text    - listing name and description joined by a single space
#   outcome - review_scores_rating as a float, NaN when the field is empty
#   meta    - the neighbourhood_cleansed column
# Listings whose stripped description is 10 characters or shorter are skipped.
text = []
outcome = []
meta = []
# newline="" is what the csv module requires so that line breaks embedded in
# quoted fields are parsed correctly; the explicit encoding keeps the read
# independent of the platform default (assumes the file is UTF-8 — TODO confirm).
with open("listings.csv", "r", newline="", encoding="utf-8") as f_in:
    for row in csv.DictReader(f_in):
        if len(row["description"].strip()) <= 10:
            continue
        rating = row["review_scores_rating"]
        text.append("{0} {1}".format(row["name"], row["description"]))
        # float() accepts both "95" and "95.0"-style values; int() would raise
        # on the latter. The values end up in a float array either way.
        outcome.append(float(rating) if rating else np.nan)
        meta.append(row["neighbourhood_cleansed"])
# Turn the collected ratings into a NumPy array and fill the missing ones.
outcome = np.array(outcome)
missing = np.isnan(outcome)
nanmin = np.nanmin(outcome)
print("nanmin", nanmin, "nans", np.count_nonzero(missing))
# Listings without a rating are assigned the lowest observed rating.
outcome[missing] = nanmin
# Binary target: True when the (imputed) rating is strictly above the median.
label = outcome > np.median(outcome)
# Summary statistics, shown as the cell's output.
(
    ("min", outcome.min()),
    ("med", np.median(outcome)),
    ("max", outcome.max()),
    ("mean", outcome.mean()),
    ("std", outcome.std()),
    ("count", outcome.shape[0]),
)
Out[2]:
In [3]:
from pprint import pprint
from time import monotonic
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features from the listing texts: 1- to 3-grams, English stop
# words removed, terms kept only if they occur in at least 10 documents and in
# at most 90% of them, vocabulary capped at 3000 features.
cv = TfidfVectorizer(min_df=10, max_df=0.9, stop_words="english", ngram_range=(1, 3), max_features=3000)
t0 = monotonic()
data = cv.fit_transform(text)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available. The result is converted to a plain list so that later list
# concatenation (["label"] + features) keeps working.
if hasattr(cv, "get_feature_names_out"):
    features = cv.get_feature_names_out().tolist()
else:
    features = cv.get_feature_names()
print("done in %0.3fs" % (monotonic() - t0))
# Average number of non-zero features per document, matrix shape, vocabulary
# size. Counting on the sparse matrix avoids allocating the full dense array.
data.count_nonzero() / data.shape[0], data.shape, len(features)
Out[3]:
In [4]:
# Preview the TF-IDF matrix. Only the first rows are densified: converting the
# whole sparse matrix to a dense array just for display wastes memory.
data[:5].toarray()
Out[4]:
In [5]:
# Show the first ten entries of the vectorizer's feature names as a sanity check.
features[:10]
Out[5]:
In [6]:
# Export the dataset to "airbnb.csr" using the csv writer:
#   - first row: "label" followed by every feature name
#   - each following row: the 0/1 label of a listing followed by the column
#     indices of the features whose TF-IDF weight is > 0 for that listing
# "label" is reserved for the first column, so it must not be a feature name.
if "label" in features:
    raise ValueError("label in features")
print("start")
# list() keeps the concatenation valid even if features is a NumPy array.
real_features = ["label"] + list(features)
tr = 0.2  # NOTE(review): not referenced in any visible cell — confirm whether it is still needed
# no meta :(
# Work on a canonical CSR copy (duplicates summed, indices sorted) so each
# row's non-zero columns can be read directly, in ascending order, instead of
# densifying the matrix and scanning every cell in Python.
csr = data.tocsr(copy=True)
csr.sum_duplicates()
csr.sort_indices()
# newline="" is what the csv module requires for files it writes (avoids extra
# carriage returns on some platforms); UTF-8 keeps non-ASCII feature names
# writable regardless of the platform default encoding.
with open("airbnb.csr", "w", newline="", encoding="utf-8") as f_out:
    out = csv.writer(f_out)
    out.writerow(real_features)
    for rix in range(csr.shape[0]):
        lo, hi = csr.indptr[rix], csr.indptr[rix + 1]
        # Keep only strictly positive entries, matching a dense "> 0" scan.
        cixs = csr.indices[lo:hi][csr.data[lo:hi] > 0].tolist()
        out.writerow([1 if label[rix] else 0] + cixs)
        # Progress indicator every 5000 rows.
        if rix % 5000 == 0:
            print(rix)
print("done")
In [7]:
import zipfile
# Pack the exported "airbnb.csr" file into a new LZMA-compressed archive.
with zipfile.ZipFile(
    file="airbnb.zip",
    mode="w",
    compression=zipfile.ZIP_LZMA,
) as zf:
    zf.write(filename="airbnb.csr")
In [ ]: