In [1]:
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
df0=pd.read_csv('qdCJInfo_1.txt',sep='\t')

In [2]:
print df0.head()


                房屋信息      面积        签约时间   成交价        单价     来源  \
0    2室2厅|高层(共9层)|南北  122.6㎡  2016-11-18  326万  26591元/㎡  房天下成交   
1    2室2厅|低层(共20层)|南    105㎡  2016-10-14  223万  21238元/㎡   市场信息   
2    1室2厅|中层(共19层)|南     70㎡  2016-10-14  147万  21000元/㎡   市场信息   
3   2室2厅|低层(共9层)|南 北    118㎡  2016-10-05  309万  26186元/㎡   市场信息   
4  2室2厅|中层(共18层)|南 北    134㎡  2016-10-05  287万  21418元/㎡   市场信息   

                                       id  Unnamed: 7  
0  http://luxinchangchunhuayuan.fang.com/         NaN  
1  http://luxinchangchunhuayuan.fang.com/         NaN  
2  http://luxinchangchunhuayuan.fang.com/         NaN  
3  http://luxinchangchunhuayuan.fang.com/         NaN  
4  http://luxinchangchunhuayuan.fang.com/         NaN  

In [3]:
df1=pd.read_csv('qdXqInfo.txt',sep='\t')

In [4]:
print df1.head()


   sell city center     year                                    href  rent  \
0  4122   崂山  颐中体育场  2006年建成  http://luxinchangchunhuayuan.fang.com/   415   
1  2146   崂山  颐中体育场  2006年建成       http://dongchengguojihe.fang.com/   251   
2  1642   黄岛    薛家岛  2014年建成           http://shanhaiwanhe.fang.com/   117   
3  1533   黄岛    薛家岛  2010年建成       http://fenghuangchengqd.fang.com/    29   
4  1471   城阳   世纪公园  2010年建成    http://meilizhichengwk0532.fang.com/   113   

                      address        cityHref    name  Unnamed: 9  
0              崂山-颐中体育场银川东路1号   /house-a0393/  鲁信长春花园         NaN  
1             崂山-颐中体育场辽阳东路16号   /house-a0393/  海尔东城国际         NaN  
2  黄岛-薛家岛漓江东路505号(薛家岛旅游度假区北侧)  /house-a01142/     山海湾         NaN  
3    黄岛-薛家岛珠江路106号(薛家岛旅游度假区内)  /house-a01142/     凤凰城         NaN  
4              城阳-世纪公园兴阳路417号   /house-a0394/  万科魅力之城         NaN  

In [5]:
# 通过id与href 将两张表join在一起
# 另外房屋信息需要重新拆分

print df0.iloc[0:5,0:7]
print df1.iloc[0:5,0:9]


                房屋信息      面积        签约时间   成交价        单价     来源  \
0    2室2厅|高层(共9层)|南北  122.6㎡  2016-11-18  326万  26591元/㎡  房天下成交   
1    2室2厅|低层(共20层)|南    105㎡  2016-10-14  223万  21238元/㎡   市场信息   
2    1室2厅|中层(共19层)|南     70㎡  2016-10-14  147万  21000元/㎡   市场信息   
3   2室2厅|低层(共9层)|南 北    118㎡  2016-10-05  309万  26186元/㎡   市场信息   
4  2室2厅|中层(共18层)|南 北    134㎡  2016-10-05  287万  21418元/㎡   市场信息   

                                       id  
0  http://luxinchangchunhuayuan.fang.com/  
1  http://luxinchangchunhuayuan.fang.com/  
2  http://luxinchangchunhuayuan.fang.com/  
3  http://luxinchangchunhuayuan.fang.com/  
4  http://luxinchangchunhuayuan.fang.com/  
   sell city center     year                                    href  rent  \
0  4122   崂山  颐中体育场  2006年建成  http://luxinchangchunhuayuan.fang.com/   415   
1  2146   崂山  颐中体育场  2006年建成       http://dongchengguojihe.fang.com/   251   
2  1642   黄岛    薛家岛  2014年建成           http://shanhaiwanhe.fang.com/   117   
3  1533   黄岛    薛家岛  2010年建成       http://fenghuangchengqd.fang.com/    29   
4  1471   城阳   世纪公园  2010年建成    http://meilizhichengwk0532.fang.com/   113   

                      address        cityHref    name  
0              崂山-颐中体育场银川东路1号   /house-a0393/  鲁信长春花园  
1             崂山-颐中体育场辽阳东路16号   /house-a0393/  海尔东城国际  
2  黄岛-薛家岛漓江东路505号(薛家岛旅游度假区北侧)  /house-a01142/     山海湾  
3    黄岛-薛家岛珠江路106号(薛家岛旅游度假区内)  /house-a01142/     凤凰城  
4              城阳-世纪公园兴阳路417号   /house-a0394/  万科魅力之城  

In [6]:
df0=df0.rename(columns={'id':'href'})

In [7]:
df_pre=pd.merge(df0,df1,on=['href'])

In [8]:
del df_pre['Unnamed: 7']
del df_pre['Unnamed: 9']

In [9]:
# 通过字符匹配,房屋信息拆分

# bedroom living (numbers)
# ishigh ismedian islow (中低高层)
# typenum (numbers)
# housetype

import re
import datetime
import time

df_pre['bedroom']=[int(unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')[0]) for i in range(0,len(df_pre))]
df_pre['living']=[int(unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')[2]) for i in range(0,len(df_pre))]
df_pre['ishigh']=[int(len(re.findall(u'高层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1 is not None) for i in range(0,len(df_pre))]
df_pre['ismedian']=[int(len(re.findall(u'中层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1 is not None) for i in range(0,len(df_pre))]
df_pre['islow']=[int(len(re.findall(u'低层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1 is not None) for i in range(0,len(df_pre))]
df_pre['isunknow']=[int(len(re.findall(u'未知层',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1 is not None) for i in range(0,len(df_pre))]
df_pre['typenum']=[int(unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')
                       [[m.start() for m in re.finditer(u'共', unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8'))][0]+1]) for i in range(0,len(df_pre))]

df_pre['housetype']=[int(len(re.findall(u'南',unicode(df_pre['房屋信息'].iloc[i], encoding='utf-8')))>=1 is not None) for i in range(0,len(df_pre))]

# square (numbers)
df_pre=df_pre.rename(columns={'面积':'square'})
df_pre['square']=[float(unicode(df_pre['square'].iloc[i], encoding='utf-8')[:-1]) for i in range(0,len(df_pre))]

# timelen (numbers)
# 计算签约时间距离建成时间的天数,默认是每年的年末建成
df_pre=df_pre.rename(columns={'签约时间':'timelen'})
df_pre['timelen']=[(datetime.datetime(*time.strptime(df_pre['timelen'].iloc[i],'%Y-%m-%d')[:6])-datetime.datetime(int(df_pre['year'].iloc[0][:4]),12,31)).days for i in range(0,len(df_pre))]

# strikeprice (numbers)
df_pre=df_pre.rename(columns={'成交价':'strikeprice'})
df_pre['strikeprice']=[float(unicode(df_pre['strikeprice'].iloc[i], encoding='utf-8')[:-1]) for i in range(0,len(df_pre))]
# unitprice
df_pre=df_pre.rename(columns={'单价':'unitprice'})
df_pre['unitprice']=[float(unicode(df_pre['unitprice'].iloc[i], encoding='utf-8')[:-3]) for i in range(0,len(df_pre))]

In [10]:
columnname=['square','strikeprice','unitprice','timelen','sell','rent',
            'bedroom','living','ishigh','ismedian','islow','isunknow','typenum','housetype']

In [11]:
df_model=df_pre[columnname]

In [12]:
df_model


Out[12]:
square strikeprice unitprice timelen sell rent bedroom living ishigh ismedian islow isunknow typenum housetype
0 122.60 326.00 26591.0 3610 4122 415 2 2 1 0 0 0 9 1
1 105.00 223.00 21238.0 3575 4122 415 2 2 0 0 1 0 2 1
2 70.00 147.00 21000.0 3575 4122 415 1 2 0 1 0 0 1 1
3 118.00 309.00 26186.0 3566 4122 415 2 2 0 0 1 0 9 1
4 134.00 287.00 21418.0 3566 4122 415 2 2 0 1 0 0 1 1
5 70.25 140.00 19929.0 3564 4122 415 1 2 0 0 1 0 1 1
6 79.00 130.00 16456.0 3564 4122 415 1 2 0 1 0 0 2 1
7 106.00 232.00 21887.0 3562 4122 415 2 2 1 0 0 0 2 1
8 119.00 274.80 23092.0 3561 4122 415 2 2 0 1 0 0 6 1
9 140.80 360.00 25568.0 3558 4122 415 3 2 0 1 0 0 1 1
10 100.00 218.00 21800.0 3557 4122 415 2 2 1 0 0 0 2 1
11 136.00 380.00 27941.0 3556 4122 415 3 2 0 0 1 0 1 1
12 140.00 299.00 21357.0 3555 4122 415 2 2 1 0 0 0 2 1
13 70.00 139.00 19857.0 3549 4122 415 1 2 0 0 1 0 1 1
14 202.00 498.00 24653.0 3548 4122 415 4 2 0 1 0 0 1 1
15 70.00 140.00 20000.0 3548 4122 415 1 2 0 1 0 0 1 1
16 108.00 235.00 21759.0 3544 4122 415 2 2 0 0 1 0 2 1
17 109.00 272.00 24954.0 3541 4122 415 2 2 1 0 0 0 1 1
18 70.00 144.50 20643.0 3540 4122 415 1 2 1 0 0 0 1 1
19 142.00 492.00 34648.0 3539 4122 415 5 2 1 0 0 0 6 1
20 70.00 139.00 19857.0 3538 4122 415 1 2 0 0 1 0 1 1
21 145.00 308.00 21241.0 3536 4122 415 3 2 0 1 0 0 2 1
22 160.00 374.00 23375.0 3524 4122 415 3 2 1 0 0 0 1 1
23 152.00 282.80 18605.0 3523 4122 415 3 2 0 0 1 0 2 1
24 108.00 210.00 19444.0 3504 4122 415 2 2 0 1 0 0 2 1
25 140.00 276.00 19714.0 3503 4122 415 2 2 0 1 0 0 2 1
26 70.00 136.00 19429.0 3493 4122 415 1 2 0 0 1 0 1 1
27 139.87 252.00 18017.0 3487 4122 415 2 1 0 0 1 0 2 1
28 159.00 330.00 20755.0 3487 4122 415 3 2 0 1 0 0 1 1
29 151.00 303.80 20119.0 3486 4122 415 3 2 0 0 1 0 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7458 63.00 81.00 12857.0 3379 7 2 2 1 1 0 0 0 7 1
7459 88.00 117.00 13295.0 3551 7 2 2 1 0 0 1 0 2 1
7460 104.00 124.50 11971.0 3548 7 2 2 1 0 1 0 0 2 1
7461 125.86 154.08 12242.0 3528 7 2 3 2 0 0 1 0 2 1
7462 101.72 116.00 11404.0 3135 7 2 2 1 0 1 0 0 3 1
7463 98.96 130.00 13137.0 3366 7 3 3 1 0 0 1 0 5 1
7464 75.52 114.00 15095.0 3584 7 0 2 1 1 0 0 0 7 1
7465 75.00 94.80 12640.0 3524 7 0 2 1 1 0 0 0 7 1
7466 72.00 115.50 16042.0 3379 7 0 2 1 0 0 1 0 8 1
7467 83.00 115.00 13855.0 3227 7 0 2 1 0 1 0 0 8 1
7468 75.00 104.00 13867.0 3227 7 0 2 1 0 0 1 0 7 0
7469 93.86 133.50 14223.0 3131 7 0 3 1 1 0 0 0 6 1
7470 92.00 73.50 7989.0 3348 6 3 2 1 0 0 1 0 2 1
7471 138.00 242.00 17536.0 3409 6 3 3 2 0 1 0 0 6 1
7472 77.80 118.00 15167.0 3165 6 1 1 1 1 0 0 0 1 1
7473 167.47 295.00 17615.0 3426 6 0 3 2 1 0 0 0 1 1
7474 167.47 267.00 15943.0 3416 6 0 3 2 0 1 0 0 1 1
7475 101.53 98.30 9682.0 3347 6 0 3 1 1 0 0 0 1 1
7476 147.00 190.00 12925.0 3319 6 0 3 1 0 1 0 0 1 1
7477 60.00 72.60 12100.0 3288 6 4 2 1 1 0 0 0 7 1
7478 79.00 133.00 16835.0 3348 6 2 2 1 1 0 0 0 6 1
7479 94.72 160.00 16892.0 3283 6 2 3 2 0 1 0 0 6 1
7480 110.70 179.00 16170.0 3237 6 2 3 2 0 0 1 0 6 1
7481 71.00 100.00 14085.0 3528 6 0 2 1 1 0 0 0 7 1
7482 77.00 100.00 12987.0 3319 6 0 2 1 0 0 1 0 7 1
7483 54.38 49.50 9103.0 3373 6 1 1 1 0 1 0 0 7 0
7484 117.00 99.00 8462.0 3510 5 6 3 2 0 1 0 0 6 1
7485 147.00 119.80 8150.0 3510 5 6 3 1 0 1 0 0 6 1
7486 117.00 100.00 8547.0 3379 5 6 3 2 0 1 0 0 6 1
7487 83.11 82.50 9927.0 3543 5 2 2 1 1 0 0 0 7 0

7488 rows × 14 columns


In [15]:
# 数据标准化
# 拆分训练集与测试集

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

df_build=pd.DataFrame(preprocessing.minmax_scale(df_model),columns=columnname)

X=df_build.drop(['unitprice'],axis=1)
Y=df_build['unitprice']

In [32]:
# 建立unitprice与其他变量之间的关系
# 使用keras建立神经网络

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size = 0.3)

xx_train=np.array(X_train)
yy_train=np.array(Y_train)
xx_test=np.array(X_test)
yy_test=np.array(Y_test)


from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=13, init='normal', activation='relu'))
    model.add(Dense(1, init='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, nb_epoch=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
# use 10-fold cross validation to evaluate this baseline model
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Standardized: %.6f (%.6f) MSE" % (results.mean(), results.std()))


Standardized: 0.00 (0.00) MSE

In [54]:
df_predict=pd.concat([pd.DataFrame(pipeline.predict(xx_test)*(max(df_model['unitprice'])-min(df_model['unitprice']))+min(df_model['unitprice'])),pd.DataFrame(yy_test*(max(df_model['unitprice'])-min(df_model['unitprice']))+min(df_model['unitprice']))],axis=1)
df_predict.columns=['predict','original']
df_predict['diffpercent']=[(df_predict['predict'].iloc[i]-df_predict['original'].iloc[i])/df_predict['original'].iloc[i]*100 for i in range(0,len(df_predict))]
df_predict


Out[54]:
predict original diffpercent
0 12928.208008 13036.0 -0.826879
1 5146.416016 5260.0 -2.159391
2 23134.542969 22463.0 2.989552
3 19310.955078 19550.0 -1.222736
4 13963.760742 14217.0 -1.781243
5 10465.264648 11023.0 -5.059742
6 11671.217773 11692.0 -0.177747
7 8640.301758 9529.0 -9.326249
8 8303.351562 8721.0 -4.788997
9 13917.920898 14191.0 -1.924312
10 15431.406250 14286.0 8.017683
11 11077.398438 12568.0 -11.860293
12 11382.454102 12159.0 -6.386593
13 18446.109375 18068.0 2.092702
14 12823.762695 13333.0 -3.819375
15 22901.878906 23467.0 -2.408152
16 21707.519531 22186.0 -2.156677
17 8137.379395 9024.0 -9.825140
18 7545.592285 8280.0 -8.869658
19 11199.341797 11747.0 -4.662111
20 12290.955078 11905.0 3.241958
21 14325.987305 14196.0 0.915661
22 11440.975586 10744.0 6.487115
23 15337.565430 15304.0 0.219325
24 11324.594727 11388.0 -0.556773
25 15774.586914 14503.0 8.767751
26 11434.456055 12233.0 -6.527785
27 8262.401367 9195.0 -10.142454
28 14444.478516 14318.0 0.883353
29 6300.693848 6988.0 -9.835520
... ... ... ...
2217 8366.275391 9195.0 -9.012774
2218 11028.868164 14098.0 -21.769980
2219 22961.964844 24409.0 -5.928285
2220 26906.814453 29433.0 -8.582834
2221 14140.391602 14047.0 0.664851
2222 12684.622070 12781.0 -0.754072
2223 21514.748047 21348.0 0.781094
2224 18188.935547 17550.0 3.640658
2225 18693.710938 17816.0 4.926532
2226 16352.674805 15891.0 2.905260
2227 15869.343750 15555.0 2.020853
2228 16637.585938 16522.0 0.699588
2229 15935.584961 14786.0 7.774821
2230 12132.460938 12696.0 -4.438713
2231 18273.199219 18633.0 -1.930987
2232 16461.332031 16170.0 1.801682
2233 12050.754883 12667.0 -4.864965
2234 13461.581055 14333.0 -6.079808
2235 21738.431641 21721.0 0.080252
2236 17977.498047 17341.0 3.670481
2237 39060.890625 43207.0 -9.595921
2238 22664.082031 24468.0 -7.372560
2239 15892.283203 16117.0 -1.394284
2240 12346.502930 12194.0 1.250639
2241 8822.909180 9088.0 -2.916932
2242 8622.603516 8271.0 4.251040
2243 5975.963867 7363.0 -18.837921
2244 10005.106445 10548.0 -5.146886
2245 16831.728516 16552.0 1.689998
2246 7274.055176 6786.0 7.192089

2247 rows × 3 columns


In [49]:


In [ ]: