In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
df0=pd.read_csv('qdCJInfo_1.txt',sep='\t')
In [2]:
print df0.head()
In [3]:
df1=pd.read_csv('qdXqInfo.txt',sep='\t')
In [4]:
print df1.head()
In [5]:
# join the two tables on id / href
# the 房屋信息 (housing info) column also needs to be split into separate features
print df0.iloc[0:5,0:7]
print df1.iloc[0:5,0:9]
In [6]:
df0=df0.rename(columns={'id':'href'})
In [7]:
df_pre=pd.merge(df0,df1,on=['href'])
In [8]:
# drop the empty 'Unnamed' columns picked up from the raw files
del df_pre['Unnamed: 7']
del df_pre['Unnamed: 9']
In [9]:
# split 房屋信息 (housing info) by string matching into:
# bedroom living (numbers)
# ishigh ismedian islow isunknow (high / middle / low / unknown floor-level flags)
# typenum (total number of floors)
# housetype (1 if south-facing)
import re
import datetime
import time
# single-character slices: position 0 is the bedroom count, position 2 the living-room count
df_pre['bedroom']=[int(unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')[0]) for i in range(0,len(df_pre))]
df_pre['living']=[int(unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')[2]) for i in range(0,len(df_pre))]
# 0/1 flags for the floor-level keywords 高层 / 中层 / 低层 / 未知层
df_pre['ishigh']=[int(len(re.findall(u'高层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1) for i in range(0,len(df_pre))]
df_pre['ismedian']=[int(len(re.findall(u'中层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1) for i in range(0,len(df_pre))]
df_pre['islow']=[int(len(re.findall(u'低层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1) for i in range(0,len(df_pre))]
df_pre['isunknow']=[int(len(re.findall(u'未知层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1) for i in range(0,len(df_pre))]
# total number of floors: the digit right after u'共'
df_pre['typenum']=[int(unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')
    [[m.start() for m in re.finditer(u'共', unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8'))][0]+1]) for i in range(0,len(df_pre))]
# 1 if the orientation contains u'南' (south-facing)
df_pre['housetype']=[int(len(re.findall(u'南',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1) for i in range(0,len(df_pre))]
# square (numbers)
df_pre=df_pre.rename(columns={'面积':'square'})
df_pre['square']=[float(unicode(df_pre['square'].iloc[i], encoding='utf-8')[:-1]) for i in range(0,len(df_pre))]
# timelen (numbers)
# days between the signing date and the completion date, assuming every building is completed on Dec 31 of its build year
df_pre=df_pre.rename(columns={'签约时间':'timelen'})
df_pre['timelen']=[(datetime.datetime(*time.strptime(df_pre['timelen'].iloc[i],'%Y-%m-%d')[:6])-datetime.datetime(int(df_pre['year'].iloc[i][:4]),12,31)).days for i in range(0,len(df_pre))]
# strikeprice (numbers)
df_pre=df_pre.rename(columns={'成交价':'strikeprice'})
df_pre['strikeprice']=[float(unicode(df_pre['strikeprice'].iloc[i], encoding='utf-8')[:-1]) for i in range(0,len(df_pre))]
# unitprice
df_pre=df_pre.rename(columns={'单价':'unitprice'})
df_pre['unitprice']=[float(unicode(df_pre['unitprice'].iloc[i], encoding='utf-8')[:-3]) for i in range(0,len(df_pre))]
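As a sanity check on the index-based parsing above, here is how a single hypothetical 房屋信息 string would be decomposed (the sample string is an assumption, not a row from the data, and is a unicode literal rather than a decoded byte string); it also shows that the one-character slices assume single-digit room and floor counts:
info = u'2室1厅 中层(共6层) 南北'  # hypothetical sample string (assumption)
print(int(info[0]))                                                      # 2 -> bedroom
print(int(info[2]))                                                      # 1 -> living
print(int(len(re.findall(u'中层', info)) >= 1))                          # 1 -> ismedian
print(int(info[[m.start() for m in re.finditer(u'共', info)][0] + 1]))   # 6 -> typenum
print(int(len(re.findall(u'南', info)) >= 1))                            # 1 -> housetype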
In [10]:
columnname=['square','strikeprice','unitprice','timelen','sell','rent',
'bedroom','living','ishigh','ismedian','islow','isunknow','typenum','housetype']
In [11]:
df_model=df_pre[columnname]
In [12]:
df_model
Out[12]:
In [15]:
# normalize the data (min-max scale every column to [0, 1])
# then split into training and test sets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
df_build=pd.DataFrame(preprocessing.minmax_scale(df_model),columns=columnname)
X=df_build.drop(['unitprice'],axis=1)
Y=df_build['unitprice']
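For reference, preprocessing.minmax_scale maps each column x to (x - x.min()) / (x.max() - x.min()), which is why In [54] below recovers the original price as scaled * (max - min) + min. An alternative sketch (an assumption, not what this notebook does) is to keep a dedicated scaler for the target so the inverse mapping is explicit:
from sklearn.preprocessing import MinMaxScaler
y_scaler = MinMaxScaler()                                    # remembers min and max of unitprice
y_scaled = y_scaler.fit_transform(df_model[['unitprice']])   # scaled target in [0, 1]
y_back = y_scaler.inverse_transform(y_scaled)                # back to the original price scale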
In [32]:
# model the relationship between unitprice and the other variables
# build a neural network regressor with Keras
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size = 0.3)
xx_train=np.array(X_train)
yy_train=np.array(Y_train)
xx_test=np.array(X_test)
yy_test=np.array(Y_test)
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=13, init='normal', activation='relu'))
    model.add(Dense(1, init='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, nb_epoch=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
# use 10-fold cross validation to evaluate this baseline model
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)  # shuffle so that random_state takes effect
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Standardized: %.6f (%.6f) MSE" % (results.mean(), results.std()))
In [54]:
price_range = max(df_model['unitprice']) - min(df_model['unitprice'])
price_min = min(df_model['unitprice'])
df_predict = pd.concat([pd.DataFrame(pipeline.predict(xx_test) * price_range + price_min),
                        pd.DataFrame(yy_test * price_range + price_min)], axis=1)
df_predict.columns = ['predict', 'original']
df_predict['diffpercent'] = [(df_predict['predict'].iloc[i] - df_predict['original'].iloc[i]) / df_predict['original'].iloc[i] * 100 for i in range(0, len(df_predict))]
df_predict
Out[54]:
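One way to condense the per-row errors above into a single number (a sketch, using the df_predict built in In [54]):
mape = df_predict['diffpercent'].abs().mean()   # mean absolute percentage error on the test set
print("test-set MAPE: %.2f%%" % mape)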