In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

from datetime import datetime

%matplotlib inline

In [2]:
ROOT_DIR = '../'



QML_DATA_DIR = ROOT_DIR + 'qml_workdir/data/'

QML_TRAIN_X_FILE_MASK = ROOT_DIR + 'qml_workdir/data/v{0:0=4d}_train_x.csv'
QML_TEST_X_FILE_MASK  = ROOT_DIR + 'qml_workdir/data/v{0:0=4d}_test_x.csv'
QML_TRAIN_Y_FILE_MASK = ROOT_DIR + 'qml_workdir/data/train_y.csv'

In [ ]:


In [9]:
train = pd.read_json(QML_DATA_DIR + "raw/train.json")
train.set_index('listing_id', inplace=True)
test = pd.read_json(QML_DATA_DIR + "raw/test.json")
test.set_index('listing_id', inplace=True)

train_y = pd.DataFrame(train['interest_level'])
train.drop(['interest_level'], axis=1, inplace=True)
all = pd.concat([train, test])


all.shape


Out[9]:
(124011, 13)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

all['features'] = all["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(all["features"])
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()

In [13]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()


Out[13]:
<1x200 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [53]:
all.columns


Out[53]:
Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'interest_level', 'latitude',
       'longitude', 'manager_id', 'photos', 'price', 'street_address'],
      dtype='object')

In [49]:
train.drop(train.columns, axis=1).to_csv(QML_DATA_DIR + 'ids_train.csv')
test.drop(test.columns, axis=1).to_csv(QML_DATA_DIR + 'ids_test.csv')

In [104]:
#pd.get_dummies(train_y, prefix='', prefix_sep='')[['high', 'medium', 'low']].to_csv(QML_TRAIN_Y_FILE_MASK)
train_y.replace({'high':0, 'medium':1, 'low':2}).to_csv(QML_TRAIN_Y_FILE_MASK)

In [79]:
res = all[['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']]

In [99]:



Out[99]:
(124011, 5)

In [83]:
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

DATA_ID = 1

res.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
res.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [107]:


In [110]:
res = all[['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']]
res["num_photos"] = all["photos"].apply(len)
res["num_features"] = all["features"].apply(len)
res["num_description_words"] = all["description"].apply(lambda x: len(x.split(" ")))
all["created"] = pd.to_datetime(all["created"])
res["created_year"] = all["created"].dt.year
res["created_month"] = all["created"].dt.month
res["created_day"] = all["created"].dt.day
res["created_ts"] = (all["created"] - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')


D:\Python36\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
D:\Python36\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
D:\Python36\lib\site-packages\ipykernel\__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Python36\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Python36\lib\site-packages\ipykernel\__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Python36\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Python36\lib\site-packages\ipykernel\__main__.py:9: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future
D:\Python36\lib\site-packages\ipykernel\__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [111]:
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

DATA_ID = 2

res.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
res.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [40]:


In [113]:
DATA_ID = 3
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(2), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(2), index_col='listing_id')

all = pd.concat([train, test])

all['coord_1'] = all['latitude'] + all['longitude']
all['coord_2'] = all['latitude'] - all['longitude']
all['coord_3'] = all['latitude'] * all['longitude']


res.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
res.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [117]:
DATA_ID = 4
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(3), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(3), index_col='listing_id')

all = pd.concat([train, test])

#train['created'] = pd.to_datetime(train['created'])

dt = all['created_ts'] * np.timedelta64(1, 's') + np.datetime64('1970-01-01T00:00:00Z')

all['created_hour'] = dt.dt.hour
all['created_weekday'] = dt.dt.weekday
all['created_week'] = dt.dt.week
all['created_quarter'] = dt.dt.quarter
all['created_weekend'] = ((all['created_weekday'] == 5) & (all['created_weekday'] == 6))
all['created_wd'] = ((all['created_weekday'] != 5) & (all['created_weekday'] != 6))


res.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
res.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))


D:\Python36\lib\site-packages\ipykernel\__main__.py:11: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future

In [ ]:


In [21]:
DATA_ID = 5
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(4), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(4), index_col='listing_id')
all = pd.concat([train, test])


raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

repl_dict = raw1.groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
for i in raw['manager_id']:
    if i not in repl_dict:
        repl_dict[i] = 0
        
all['man_objects_count'] = raw['manager_id'].replace(repl_dict)

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [15]:


In [22]:
DATA_ID = 6
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(5), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(5), index_col='listing_id')
all = pd.concat([train, test])


raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

repl_dict1 = raw1[raw1['interest_level']=='high'].groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
repl_dict2 = raw1[raw1['interest_level']=='medium'].groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
repl_dict3 = raw1[raw1['interest_level']=='low'].groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()

for i in raw['manager_id']:
    if i not in repl_dict1:
        repl_dict1[i] = 0
    if i not in repl_dict2:
        repl_dict2[i] = 0
    if i not in repl_dict3:
        repl_dict3[i] = 0

all['man_objects_high'] = raw['manager_id'].replace(repl_dict1)
all['man_objects_medium'] = raw['manager_id'].replace(repl_dict2)
all['man_objects_low'] = raw['manager_id'].replace(repl_dict3)

all['man_objects_high_to_count'] = all['man_objects_high'] / (all['man_objects_count'] +1)
all['man_objects_medium_to_count'] = all['man_objects_medium'] / (all['man_objects_count'] +1)
all['man_objects_low_to_count'] = all['man_objects_low'] / (all['man_objects_count'] +1)


all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [22]:


In [22]:
DATA_ID = 7
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(6), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(6), index_col='listing_id')
all = pd.concat([train, test])

all["pricePerBed"] = all['price'] / all['bedrooms']
all["pricePerBath"] = all['price'] / all['bathrooms']
all["pricePerRoom"] = all['price'] / (all['bedrooms'] + all['bathrooms'])
all["bedPerBath"] = all['bedrooms'] / all['bathrooms']
all["bedBathDiff"] = all['bedrooms'] - all['bathrooms']
all["bedBathSum"] = all["bedrooms"] + all['bathrooms']
all["bedsPerc"] = all["bedrooms"] / (all['bedrooms'] + all['bathrooms'])

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [22]:


In [27]:
DATA_ID = 8
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(7), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(7), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

repl_dict = raw1.groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
for i in raw['manager_id']:
    if i not in repl_dict:
        repl_dict[i] = 0
        
all['man_objects_count'] = raw['manager_id'].replace(repl_dict)




all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))


D:\Python36\lib\site-packages\ipykernel\__main__.py:12: DeprecationWarning: parsing timezone aware datetimes is deprecated; this will raise an error in the future

In [28]:
DATA_ID = 9
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(8), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(8), index_col='listing_id')
all = pd.concat([train, test])



raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

repl_dict = raw1.groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
repl_dict_mean = np.mean([x for x in repl_dict.values()])
for i in raw['manager_id']:
    if i not in repl_dict:
        repl_dict[i] = repl_dict_mean
        
all['man_objects_count'] = raw['manager_id'].replace(repl_dict)


repl_dict1 = raw1[raw1['interest_level']=='high'].groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
repl_dict2 = raw1[raw1['interest_level']=='medium'].groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()
repl_dict3 = raw1[raw1['interest_level']=='low'].groupby('manager_id')['manager_id'].count().sort_values(ascending=False).to_dict()

repl_dict_mean1 = np.mean([x for x in repl_dict1.values()])
repl_dict_mean2 = np.mean([x for x in repl_dict2.values()])
repl_dict_mean3 = np.mean([x for x in repl_dict3.values()])

for i in raw['manager_id']:
    if i not in repl_dict1:
        repl_dict1[i] = repl_dict_mean1
    if i not in repl_dict2:
        repl_dict2[i] = repl_dict_mean2
    if i not in repl_dict3:
        repl_dict3[i] = repl_dict_mean3

all['man_objects_high'] = raw['manager_id'].replace(repl_dict1)
all['man_objects_medium'] = raw['manager_id'].replace(repl_dict2)
all['man_objects_low'] = raw['manager_id'].replace(repl_dict3)

all['man_objects_high_to_count'] = all['man_objects_high'] / (all['man_objects_count'] +1)
all['man_objects_medium_to_count'] = all['man_objects_medium'] / (all['man_objects_count'] +1)
all['man_objects_low_to_count'] = all['man_objects_low'] / (all['man_objects_count'] +1)




all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [5]:
DATA_ID = 10
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(8), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(8), index_col='listing_id')
all = pd.concat([train, test])



raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

repl_dict = raw1.groupby('building_id')['building_id'].count().sort_values(ascending=False).to_dict()
for i in raw['building_id']:
    if i not in repl_dict:
        repl_dict[i] = 0
        
all['build_objects_count'] = raw['building_id'].replace(repl_dict)


repl_dict1 = raw1[raw1['interest_level']=='high'].groupby('building_id')['building_id'].count().sort_values(ascending=False).to_dict()
repl_dict2 = raw1[raw1['interest_level']=='medium'].groupby('building_id')['building_id'].count().sort_values(ascending=False).to_dict()
repl_dict3 = raw1[raw1['interest_level']=='low'].groupby('building_id')['building_id'].count().sort_values(ascending=False).to_dict()


for i in raw['building_id']:
    if i not in repl_dict1:
        repl_dict1[i] = 0
    if i not in repl_dict2:
        repl_dict2[i] = 0
    if i not in repl_dict3:
        repl_dict3[i] = 0

all['build_objects_high'] = raw['building_id'].replace(repl_dict1)
all['build_objects_medium'] = raw['building_id'].replace(repl_dict2)
all['build_objects_low'] = raw['building_id'].replace(repl_dict3)

all['build_objects_high_to_count'] = all['build_objects_high'] / (all['build_objects_count'] +1)
all['build_objects_medium_to_count'] = all['build_objects_medium'] / (all['build_objects_count'] +1)
all['build_objects_low_to_count'] = all['build_objects_low'] / (all['build_objects_count'] +1)




all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [13]:
DATA_ID = 11
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(10), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(10), index_col='listing_id')
all = pd.concat([train, test])



        
del all['build_objects_count']
del all['build_objects_high']## = raw['building_id'].replace(repl_dict1)
del all['build_objects_medium']# = raw['building_id'].replace(repl_dict2)
del all['build_objects_low']# = raw['building_id'].replace(repl_dict3)

del all['build_objects_high_to_count']# = all['build_objects_high'] / (all['build_objects_count'] +1)
del all['build_objects_medium_to_count']# = all['build_objects_medium'] / (all['build_objects_count'] +1)
del all['build_objects_low_to_count']# = all['build_objects_low'] / (all['build_objects_count'] +1)


        
del all['man_objects_count']
del all['man_objects_high']## = raw['maning_id'].replace(repl_dict1)
del all['man_objects_medium']# = raw['maning_id'].replace(repl_dict2)
del all['man_objects_low']# = raw['maning_id'].replace(repl_dict3)

del all['man_objects_high_to_count']# = all['man_objects_high'] / (all['man_objects_count'] +1)
del all['man_objects_medium_to_count']# = all['man_objects_medium'] / (all['man_objects_count'] +1)
del all['man_objects_low_to_count']# = all['man_objects_low'] / (all['man_objects_count'] +1)




all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [15]:
DATA_ID = 12
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(11), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(11), index_col='listing_id')
all = pd.concat([train, test])



        
all['listing_id'] = all.index.values


all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [16]:
all.columns


Out[16]:
Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'num_photos',
       'num_features', 'num_description_words', 'created_year',
       'created_month', 'created_day', 'created_ts', 'pricePerBed',
       'pricePerBath', 'pricePerRoom', 'bedPerBath', 'bedBathDiff',
       'bedBathSum', 'bedsPerc', 'kazanova_ts', 'kazanova_year',
       'kazanova_month', 'kazanova_day', 'kazanova_hour', 'kazanova_weekday',
       'kazanova_week', 'kazanova_quarter', 'kazanova_weekend', 'kazanova_wd',
       'listing_id'],
      dtype='object')

In [25]:
DATA_ID = 13
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(12), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(12), index_col='listing_id')
all = pd.concat([train, test])

addr = {
    '145 28 Street' : (-73.99244,	40.74708),
    'Van Sicklen street' : (	-73.97504,	40.59679),
    '219 E 28th' : (	-73.97982,	40.74179),
    '1632 Madison Ave' : (	-73.94847,	40.79576),
    '41-42 24th St' : (	-73.94131,	40.75153),
    '450 East 83rd Street' : (	-73.94899,	40.77399),
    '247 west 87' : (	-73.97555,	40.78888),
    '118 W 109th' : (	-73.96273,	40.8015),
    '246 Mott St' : (	-73.99466,	40.72328),
    '21 W 106th' : (	-73.96095,	40.79874),
    '338 e. 53':(	-73.96576,	40.75591),
    '259 Decatur Street':(	-73.93344,	40.68165)
}

all['longitude'].replace({0:all['longitude'].mean()}, inplace=True)
all['latitude'].replace({0:all['latitude'].mean()}, inplace=True)


all['coord_1'] = all['latitude'] + all['longitude']
all['coord_2'] = all['latitude'] - all['longitude']
all['coord_3'] = all['latitude'] * all['longitude']




#ny_lat = 40.785091
#ny_lon = -73.968285

#all['dist_center'] = ((all['longitude'] - ny_lon)**2  + (all['latitude'] - ny_lat)**2)**0.5

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [17]:


In [26]:
DATA_ID = 14
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(13), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(13), index_col='listing_id')
all = pd.concat([train, test])


ny_lat = 40.785091
ny_lon = -73.968285

all['dist_center'] = ((all['longitude'] - ny_lon)**2  + (all['latitude'] - ny_lat)**2)**0.5

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [60]:
DATA_ID = 15
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])


raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

col = raw.groupby('manager_id')['manager_id'].count().sort_values(ascending=False)
col[200:] = None


all['man'] = raw['manager_id'].replace(col[200:].to_dict())
all = pd.get_dummies(all, columns=['man'])


all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [59]:



Out[59]:
(124011, 34)

In [3]:
DATA_ID = 16
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])


raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

col = raw.groupby('building_id')['building_id'].count().sort_values(ascending=False)
col[200:] = None

all['building'] = raw['building_id'].replace(col[200:].to_dict())
all = pd.get_dummies(all, columns=['building'])


all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [8]:
DATA_ID = 17
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])


raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

col = raw.groupby('display_address')['display_address'].count().sort_values(ascending=False)
col[200:] = None

all['display_address'] = raw['display_address'].replace(col[200:].to_dict())
all = pd.get_dummies(all, columns=['display_address'])


all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:
DATA_ID = 18
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])


raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

col = raw.groupby('display_address')['display_address'].count().sort_values(ascending=False)
col[200:] = None

all['display_address'] = raw['display_address'].replace(col[200:].to_dict())
all = pd.get_dummies(all, columns=['display_address'])


all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]: