In [1]:
# Data-handling and plotting libraries for the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Default seaborn colour palette; not referenced by any cell visible in this notebook.
color = sns.color_palette()
from datetime import datetime
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
# Filesystem layout of the prepared data sets.
# All paths are relative to the notebook's working directory.
ROOT_DIR = '../'
QML_DATA_DIR = ROOT_DIR + 'qml_workdir/data/'

# Versioned feature files: `.format(n)` fills in a zero-padded, 4-digit data-set
# id (e.g. 7 -> 'v0007_train_x.csv'). They are built from QML_DATA_DIR so the
# directory is spelled out in exactly one place.
QML_TRAIN_X_FILE_MASK = QML_DATA_DIR + 'v{0:0=4d}_train_x.csv'
QML_TEST_X_FILE_MASK = QML_DATA_DIR + 'v{0:0=4d}_test_x.csv'
# The target file is not versioned: its name has no placeholder.
QML_TRAIN_Y_FILE_MASK = QML_DATA_DIR + 'train_y.csv'
In [ ]:
In [9]:
# Read both raw JSON dumps and key every listing by its listing_id.
train = pd.read_json(QML_DATA_DIR + "raw/train.json").set_index('listing_id')
test = pd.read_json(QML_DATA_DIR + "raw/test.json").set_index('listing_id')

# Move the target out of the training frame into its own one-column DataFrame
# (pop() removes the column from `train` and returns it).
train_y = pd.DataFrame(train.pop('interest_level'))

# Stack train and test so features can be built over both at once.
# NOTE: the name `all` shadows the Python builtin; it is kept because the
# other cells of the notebook refer to this frame by that name.
all = pd.concat([train, test])
all.shape
Out[9]:
In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy import sparse

# Turn each listing's list of feature phrases into one space-separated string,
# replacing the spaces inside a phrase with underscores so a multi-word phrase
# is not split on its spaces. The text is kept in its own variable: overwriting
# all['features'] would make any later len()-based feature count measure the
# string length instead of the number of features.
features_text = all["features"].apply(
    lambda phrases: " ".join(phrase.replace(" ", "_") for phrase in phrases)
)

# Token counts restricted to the 200 most frequent tokens. (The variable is
# called `tfidf`, but it holds a plain CountVectorizer.)
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(features_text)

# FIXME(review): `train_df` and `features_to_use` are not defined in any cell
# visible in this notebook, so this line raises NameError on a fresh kernel.
# Confirm which frame/columns were intended before relying on `train_X`.
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
In [13]:
# Stack the selected dense columns and the sparse token counts into one CSR matrix.
# NOTE(review): `train_df` and `features_to_use` are not defined in any cell
# visible in this notebook, and `sparse` is presumably scipy.sparse - make sure
# all three exist before running this cell.
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
Out[13]:
In [53]:
# Display the column names of the current `all` frame.
all.columns
Out[53]:
In [49]:
# Persist which listing ids belong to which split. Dropping every column
# leaves a frame that carries only the index, so each CSV holds just the ids.
train_ids_only = train.drop(train.columns, axis=1)
test_ids_only = test.drop(test.columns, axis=1)

train_ids_only.to_csv(QML_DATA_DIR + 'ids_train.csv')
test_ids_only.to_csv(QML_DATA_DIR + 'ids_test.csv')
In [104]:
# Encode the target as ordinal integers and write it next to the feature files.
# (A one-hot alternative via pd.get_dummies with columns
# ['high', 'medium', 'low'] was tried earlier and left disabled.)
interest_codes = {'high': 0, 'medium': 1, 'low': 2}
train_y_encoded = train_y.replace(interest_codes)
train_y_encoded.to_csv(QML_TRAIN_Y_FILE_MASK)
In [79]:
# Baseline feature set: the raw numeric columns, taken as-is.
base_columns = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']
res = all[base_columns]
In [99]:
Out[99]:
In [83]:
DATA_ID = 1

# Recover the train/test membership from the id files written earlier.
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Write each split of `res` to its versioned file (train first, then test).
for split_ids, file_mask in ((ids_train, QML_TRAIN_X_FILE_MASK),
                             (ids_test, QML_TEST_X_FILE_MASK)):
    res.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [107]:
In [110]:
# Data set 2 features: the baseline numeric columns plus simple size counts
# and date parts of the creation timestamp.
# .copy() gives `res` its own data, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
res = all[['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']].copy()

res["num_photos"] = all["photos"].apply(len)
# NOTE(review): len() counts features only while all['features'] still holds
# the original lists; if that column has been replaced by joined strings this
# is the string length instead - confirm the column's state before running.
res["num_features"] = all["features"].apply(len)
res["num_description_words"] = all["description"].apply(lambda x: len(x.split(" ")))

# Parse the creation time once (re-running on already-parsed values is a no-op).
all["created"] = pd.to_datetime(all["created"])
created = all["created"]
res["created_year"] = created.dt.year
res["created_month"] = created.dt.month
res["created_day"] = created.dt.day
# Seconds since the Unix epoch. The epoch literal carries no 'Z' suffix because
# NumPy deprecated parsing time-zone designators; the resulting value is the same.
res["created_ts"] = (created - np.datetime64('1970-01-01T00:00:00')) / np.timedelta64(1, 's')
In [111]:
DATA_ID = 2

# Recover the train/test membership from the id files written earlier.
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Slice `res` by split and write each part to its versioned file.
train_part = res.loc[ids_train]
train_part.to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
test_part = res.loc[ids_test]
test_part.to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [40]:
In [113]:
DATA_ID = 3
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 2 and add simple combinations of the coordinates.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(2), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(2), index_col='listing_id')
all = pd.concat([train, test])

all['coord_1'] = all['latitude'] + all['longitude']
all['coord_2'] = all['latitude'] - all['longitude']
all['coord_3'] = all['latitude'] * all['longitude']

# Write `all`: it is the frame that carries the new coord_* columns. The `res`
# frame left over from earlier cells does not contain them.
all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [117]:
DATA_ID = 4
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 3 and add finer-grained parts of the creation time.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(3), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(3), index_col='listing_id')
all = pd.concat([train, test])

# `created_ts` holds seconds since the Unix epoch; rebuild the timestamps from it.
dt = pd.to_datetime(all['created_ts'], unit='s')

all['created_hour'] = dt.dt.hour
all['created_weekday'] = dt.dt.weekday
# `.dt.week` no longer exists in pandas 2; use isocalendar() where available
# and fall back to `.dt.week` on older pandas versions.
all['created_week'] = dt.dt.isocalendar().week if hasattr(dt.dt, 'isocalendar') else dt.dt.week
all['created_quarter'] = dt.dt.quarter

# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday.
# A day is a weekend day if it is EITHER of them (an `&` of the two equality
# tests can never be true).
is_weekend = all['created_weekday'].isin([5, 6])
all['created_weekend'] = is_weekend
all['created_wd'] = ~is_weekend

# Write `all`: it is the frame that carries the new created_* columns. The
# `res` frame left over from earlier cells does not contain them.
all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [21]:
DATA_ID = 5
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 4.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(4), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(4), index_col='listing_id')
all = pd.concat([train, test])

# The manager ids only exist in the raw JSON, so reload it.
raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

# Number of listings per manager, counted on the training split only.
manager_counts = raw1['manager_id'].value_counts()

# map() does one lookup per row; managers that never appear in train come back
# as NaN and are set to 0. (Series.replace with a dict this large is far slower
# and needed a Python loop to pre-fill the missing keys.)
# The assignment aligns on the listing_id index shared by `raw` and `all`.
all['man_objects_count'] = raw['manager_id'].map(manager_counts).fillna(0).astype(int)

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [15]:
In [22]:
DATA_ID = 6
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 5 (which supplies `man_objects_count`).
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(5), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(5), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

interest_levels = ('high', 'medium', 'low')

# Per manager: number of training listings at each interest level; managers
# without any listing at that level (or absent from train) get 0.
# NOTE(review): the counts come from the training labels and are attached to
# the same training rows, which may leak the target - confirm this is acceptable.
for level in interest_levels:
    level_counts = raw1.loc[raw1['interest_level'] == level, 'manager_id'].value_counts()
    all['man_objects_' + level] = raw['manager_id'].map(level_counts).fillna(0).astype(int)

# Share of each level among the manager's listings; the +1 keeps the
# denominator non-zero for managers with no training listings.
for level in interest_levels:
    all['man_objects_' + level + '_to_count'] = (
        all['man_objects_' + level] / (all['man_objects_count'] + 1)
    )

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [22]:
In [22]:
DATA_ID = 7
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 6 and add price/room ratio features.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(6), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(6), index_col='listing_id')
all = pd.concat([train, test])

price = all['price']
beds = all['bedrooms']
baths = all['bathrooms']
rooms = beds + baths

# NOTE(review): rows with 0 bedrooms and/or 0 bathrooms produce inf or NaN in
# the ratios below - verify that downstream consumers cope with that.
all["pricePerBed"] = price / beds
all["pricePerBath"] = price / baths
all["pricePerRoom"] = price / rooms
all["bedPerBath"] = beds / baths
all["bedBathDiff"] = beds - baths
all["bedBathSum"] = rooms
all["bedsPerc"] = beds / rooms

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [22]:
In [27]:
DATA_ID = 8
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 7.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(7), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(7), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

# Number of listings per manager, counted on the training split only.
manager_counts = raw1['manager_id'].value_counts()

# map() does one lookup per row; managers that never appear in train come back
# as NaN and are set to 0.
# NOTE(review): the data set 5 step computes `man_objects_count` the same way
# and data set 7 descends from it, so this presumably overwrites the column
# with identical values - confirm whether this step is still needed.
all['man_objects_count'] = raw['manager_id'].map(manager_counts).fillna(0).astype(int)

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [28]:
DATA_ID = 9
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 8 and rebuild the manager features, filling unknown
# managers with the mean count instead of 0.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(8), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(8), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

# Listings per manager on the training split; managers not seen in train get
# the mean count over the managers that were seen.
manager_counts = raw1['manager_id'].value_counts()
all['man_objects_count'] = raw['manager_id'].map(manager_counts).fillna(manager_counts.mean())

interest_levels = ('high', 'medium', 'low')

# Same idea per interest level. The mean is taken over the managers that have
# at least one training listing at that level.
# NOTE(review): a manager that IS in train but has no listing at a given level
# also receives that mean (not 0) - confirm this is intended.
for level in interest_levels:
    level_counts = raw1.loc[raw1['interest_level'] == level, 'manager_id'].value_counts()
    all['man_objects_' + level] = raw['manager_id'].map(level_counts).fillna(level_counts.mean())

# Share of each level among the manager's listings (+1 keeps the denominator non-zero).
for level in interest_levels:
    all['man_objects_' + level + '_to_count'] = (
        all['man_objects_' + level] / (all['man_objects_count'] + 1)
    )

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [5]:
DATA_ID = 10
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 8 (not 9) and add the building-level counterparts of the
# manager features.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(8), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(8), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

# Listings per building on the training split; buildings not seen in train get 0.
building_counts = raw1['building_id'].value_counts()
all['build_objects_count'] = raw['building_id'].map(building_counts).fillna(0).astype(int)

interest_levels = ('high', 'medium', 'low')

# Per building: number of training listings at each interest level (0 if none).
for level in interest_levels:
    level_counts = raw1.loc[raw1['interest_level'] == level, 'building_id'].value_counts()
    all['build_objects_' + level] = raw['building_id'].map(level_counts).fillna(0).astype(int)

# Share of each level among the building's listings (+1 keeps the denominator non-zero).
for level in interest_levels:
    all['build_objects_' + level + '_to_count'] = (
        all['build_objects_' + level] / (all['build_objects_count'] + 1)
    )

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [13]:
DATA_ID = 11
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 10 and strip every building/manager count feature again.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(10), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(10), index_col='listing_id')
all = pd.concat([train, test])

dropped_columns = [
    'build_objects_count',
    'build_objects_high',
    'build_objects_medium',
    'build_objects_low',
    'build_objects_high_to_count',
    'build_objects_medium_to_count',
    'build_objects_low_to_count',
    'man_objects_count',
    'man_objects_high',
    'man_objects_medium',
    'man_objects_low',
    'man_objects_high_to_count',
    'man_objects_medium_to_count',
    'man_objects_low_to_count',
]
# Raises KeyError if any of the columns is missing from the input.
all = all.drop(dropped_columns, axis=1)

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [15]:
DATA_ID = 12
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 11 and expose the listing id itself as a feature column.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(11), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(11), index_col='listing_id')
all = pd.concat([train, test])

# NOTE(review): the index is also named `listing_id`, so the written CSV will
# carry two `listing_id` headers (index + column) - verify how downstream
# readers resolve the duplicate name.
all = all.assign(listing_id=all.index.values)

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [16]:
# Display the column names of the current `all` frame.
all.columns
Out[16]:
In [25]:
DATA_ID = 13
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 12.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(12), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(12), index_col='listing_id')
all = pd.concat([train, test])

# Hand-collected coordinates for a few addresses; the tuples appear to be
# (longitude, latitude). NOTE(review): `addr` is not referenced by any code in
# this cell - confirm whether it was meant to patch the zero coordinates below.
addr = {
    '145 28 Street' : (-73.99244, 40.74708),
    'Van Sicklen street' : ( -73.97504, 40.59679),
    '219 E 28th' : ( -73.97982, 40.74179),
    '1632 Madison Ave' : ( -73.94847, 40.79576),
    '41-42 24th St' : ( -73.94131, 40.75153),
    '450 East 83rd Street' : ( -73.94899, 40.77399),
    '247 west 87' : ( -73.97555, 40.78888),
    '118 W 109th' : ( -73.96273, 40.8015),
    '246 Mott St' : ( -73.99466, 40.72328),
    '21 W 106th' : ( -73.96095, 40.79874),
    '338 e. 53':( -73.96576, 40.75591),
    '259 Decatur Street':( -73.93344, 40.68165)
}

# Replace coordinates of exactly 0 with the column mean (the mean is taken
# over all rows, zeros included). The result is assigned back to the column:
# calling replace(..., inplace=True) on `all['col']` acts on an intermediate
# object and is not guaranteed to update `all` (it does not under pandas'
# copy-on-write mode).
all['longitude'] = all['longitude'].replace(0, all['longitude'].mean())
all['latitude'] = all['latitude'].replace(0, all['latitude'].mean())

# Recompute the coordinate combinations from the cleaned values.
all['coord_1'] = all['latitude'] + all['longitude']
all['coord_2'] = all['latitude'] - all['longitude']
all['coord_3'] = all['latitude'] * all['longitude']

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [17]:
In [26]:
DATA_ID = 14
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 13 and add the distance to a fixed reference point.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(13), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(13), index_col='listing_id')
all = pd.concat([train, test])

# Reference point used as the "centre".
ny_lat = 40.785091
ny_lon = -73.968285

# Plain Euclidean distance in coordinate units (degrees), not a geodesic distance.
lon_offset = all['longitude'] - ny_lon
lat_offset = all['latitude'] - ny_lat
all['dist_center'] = (lon_offset**2 + lat_offset**2)**0.5

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [60]:
DATA_ID = 15
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 14 and one-hot encode the most frequent managers.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

TOP_N = 200  # number of most frequent managers that get their own dummy column

# Managers ordered by number of listings over train and test combined.
col = raw.groupby('manager_id')['manager_id'].count().sort_values(ascending=False)
top_managers = col.index[:TOP_N]

# Keep the id only for the TOP_N managers; every other row becomes NaN, which
# get_dummies ignores (all dummy columns 0 for that row). isin()/where() avoids
# a dict replace() with one entry per rare manager.
all['man'] = raw['manager_id'].where(raw['manager_id'].isin(top_managers))
all = pd.get_dummies(all, columns=['man'])

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [59]:
Out[59]:
In [3]:
DATA_ID = 16
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 14 and one-hot encode the most frequent buildings.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

TOP_N = 200  # number of most frequent buildings that get their own dummy column

# Buildings ordered by number of listings over train and test combined.
col = raw.groupby('building_id')['building_id'].count().sort_values(ascending=False)
top_buildings = col.index[:TOP_N]

# Keep the id only for the TOP_N buildings; every other row becomes NaN, which
# get_dummies ignores (all dummy columns 0 for that row). isin()/where() avoids
# a dict replace() with one entry per rare building.
all['building'] = raw['building_id'].where(raw['building_id'].isin(top_buildings))
all = pd.get_dummies(all, columns=['building'])

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [8]:
DATA_ID = 17
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 14 and one-hot encode the most frequent display addresses.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

TOP_N = 200  # number of most frequent addresses that get their own dummy column

# Addresses ordered by number of listings over train and test combined.
col = raw.groupby('display_address')['display_address'].count().sort_values(ascending=False)
top_addresses = col.index[:TOP_N]

# Keep the address only for the TOP_N values; every other row becomes NaN,
# which get_dummies ignores (all dummy columns 0 for that row). isin()/where()
# avoids a dict replace() with one entry per rare address.
all['display_address'] = raw['display_address'].where(raw['display_address'].isin(top_addresses))
all = pd.get_dummies(all, columns=['display_address'])

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
DATA_ID = 18
ids_train = pd.read_csv(QML_DATA_DIR + 'ids_train.csv', index_col='listing_id').index
ids_test = pd.read_csv(QML_DATA_DIR + 'ids_test.csv', index_col='listing_id').index

# Start from data set 14 and one-hot encode the most frequent display addresses.
# NOTE(review): apart from DATA_ID this step encodes the same column from the
# same inputs as the data set 17 step, so both outputs are presumably
# identical - confirm whether a different column was intended here.
train = pd.read_csv(QML_TRAIN_X_FILE_MASK.format(14), index_col='listing_id')
test = pd.read_csv(QML_TEST_X_FILE_MASK.format(14), index_col='listing_id')
all = pd.concat([train, test])

raw1 = pd.read_json(QML_DATA_DIR + "raw/train.json")
raw1.set_index('listing_id', inplace=True)
raw2 = pd.read_json(QML_DATA_DIR + "raw/test.json")
raw2.set_index('listing_id', inplace=True)
raw = pd.concat([raw1, raw2])

TOP_N = 200  # number of most frequent addresses that get their own dummy column

# Addresses ordered by number of listings over train and test combined.
col = raw.groupby('display_address')['display_address'].count().sort_values(ascending=False)
top_addresses = col.index[:TOP_N]

# Keep the address only for the TOP_N values; every other row becomes NaN,
# which get_dummies ignores (all dummy columns 0 for that row). isin()/where()
# avoids a dict replace() with one entry per rare address.
all['display_address'] = raw['display_address'].where(raw['display_address'].isin(top_addresses))
all = pd.get_dummies(all, columns=['display_address'])

all.loc[ids_train].to_csv(QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]: