根据链家最近7天,最近30天看房记录,判断市场火热度,以及某房型的火热度


In [1]:
import requests,time,os,csv,re,logging
from bs4 import BeautifulSoup

获取html


In [2]:
def get_html(url):
    """Fetch *url* and return the decoded HTML text, or None on failure.

    Sets the response encoding to the apparent (content-sniffed) encoding so
    Chinese pages decode correctly. Any request failure (timeout, connection
    error, 4xx/5xx status) is logged and swallowed; callers must handle None.
    """
    try:
        r = requests.get(url, timeout=16)
        r.raise_for_status()  # turn 4xx/5xx into an exception
        r.encoding = r.apparent_encoding  # sniffed charset beats the header default
        return r.text
    # narrow catch: the original bare `except:` also swallowed KeyboardInterrupt
    except requests.RequestException:
        now_time = time.strftime('%Y%m%d%H%M%S')
        logging.info(now_time +'      ' +'get_html 异常, ' '    url:'+ url)

尝试获取n次


In [3]:
def get_n_times(page_url, n=5):
    """Fetch *page_url*, retrying up to *n* times on failure.

    An attempt counts as failed when get_html returned None or the body
    contains "小差" (Lianjia's "server hiccup" error page). Each retry
    sleeps a little longer than the last (0.1s, 0.2s, ...). May still
    return None or an error page if every attempt fails.
    """
    index_page = get_html(page_url)
    for attempt in range(1, n + 1):
        if index_page is not None and "小差" not in index_page:
            break
        time.sleep(0.1 * attempt)  # simple linear back-off
        index_page = get_html(page_url)
    return index_page

获取某 tag 下总页数


In [4]:
def get_total_page(tag_url):
    """Return (listing_count, page_count) for a Lianjia tag URL.

    Fetches page 1 ("d1") of the tag, reads the "共找到 N 套" counter and
    the pager's embedded page-data attribute. On fetch failure returns
    (None, 101) so callers still iterate a bounded number of pages.
    """
    import ast  # local import; only this function needs it

    page_url = tag_url + 'd1'
    index_page = get_n_times(page_url)

    if index_page is None or "小差" in index_page:
        # BUG FIX: the original left taged_total_num unassigned on this
        # branch, so the return below raised UnboundLocalError.
        taged_total_num = None
        total_page = 101
    else:
        soup = BeautifulSoup(index_page, 'html.parser')
        # "共找到 N 套" — how many listings match this tag
        taged_total_num = soup.find('div', attrs={'class': 'resultDes clear'}).span.string
        page_data = soup.find('div', attrs={'class': 'page-box house-lst-page-box'})['page-data']
        # page-data holds a dict literal like {"totalPage":8,"curPage":1};
        # literal_eval parses it without eval()'s arbitrary-code risk
        total_page = ast.literal_eval(page_data)['totalPage']
    return taged_total_num, total_page

页面解析并写入csv


In [49]:
def parse_index_page(date, index_page, tag, page_num, total_page):
    """Parse one listings index page and append one CSV row per listing.

    Writes to results/<date>_index_info_all.csv, creating the file with a
    header row on first use. Each row carries the run context (date, tag,
    counts, page number) plus the listing's key, title, total and per-sqm
    price, and an optional first property tag (e.g. distance to metro).
    """
    # portable path join (the original hard-coded a Windows backslash)
    path = os.path.join(os.getcwd(), 'results', '%s_index_info_all.csv' % date)
    fieldnames = ['date', 'tag', 'taged_total_num', 'total_page', 'page_num',
                  'key', 'title', 'total_price', 'total_price_unit',
                  'per_price', 'per_price_unit', 'prop1']

    # create the file with a header row the first time around
    if not os.path.exists(path):
        with open(path, 'w', newline='') as csv_file:
            csv.DictWriter(csv_file, fieldnames=fieldnames).writeheader()

    # e.g. "56789元/平" -> ('56789', '元/平'); raw string, compiled once
    per_price_re = re.compile(r'(\d+)(\S{3})')

    soup = BeautifulSoup(index_page, 'html.parser')
    # "共找到 N 套" — total listings matched by the tag
    taged_total_num = soup.find('div', attrs={'class': 'resultDes clear'}).span.string

    item = {'date': date, 'tag': tag, 'taged_total_num': taged_total_num,
            'total_page': total_page, 'page_num': page_num}

    # `with` guarantees the file is closed even if a listing fails to parse
    with open(path, 'a+', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        index_info = soup.find_all('a', attrs={'class': 'text link-hover-green js_triggerGray js_fanglist_title'})
        for a in index_info:
            item.update({'key': a['key'], 'title': a['title']})

            # the price block is the title's next *element* sibling
            # (skip the intervening whitespace text node)
            price_div = a.parent.next_sibling.next_sibling
            total_price = price_div.find('span', attrs={'class': 'total-price strong-num'}).string
            total_price_unit = price_div.find('span', attrs={'class': 'unit'}).string
            per_price_info = price_div.find('span', attrs={'class': 'info-col price-item minor'}).string
            per_price, per_price_unit = per_price_re.search(per_price_info).groups()

            prop_div = price_div.next_sibling.next_sibling
            props = prop_div.find_all('span', attrs={'class': 'c-prop-tag2'})
            # keep only a distance-to-metro style tag ("距离..."), else blank
            if props and "距离" in props[0].string:
                prop1 = props[0].string
            else:
                prop1 = ""

            item.update({'total_price': total_price, 'total_price_unit': total_price_unit,
                         'per_price': per_price, 'per_price_unit': per_price_unit,
                         'prop1': prop1})
            # write row-by-row; a helper that wrote one shared dict at the end
            # would repeat the last item (original author's note)
            writer.writerow(item)

链接:

西渡/两室/三室:https://sh.lianjia.com/ershoufang/xidu/l2l3/

输入“限制条件”后的 base_url 输出所有的房子的数量,及 id


In [6]:
# Run configuration: Xidu area, 2- and 3-room (l2l3) second-hand listings.
base_url= 'https://sh.lianjia.com/ershoufang/xidu/l2l3/'
# Run date; used as the CSV file-name prefix in parse_index_page.
date=time.strftime('%Y%m%d')
# log_dir=os.path.join(os.getcwd(),'results/get_index.log')
# logging.basicConfig(filename=log_dir,level=logging.INFO)

In [7]:
def get_thispage_ids(url):
    """Return the house ids found on one listings index page.

    Ids come from either the ``data-housecode`` or the
    ``data-lj_action_housedel_id`` attribute of each listing <li>.
    Returns an empty list when the page could not be fetched at all.
    """
    # fresh list per call — a shared list would accumulate across runs
    house_ids = []
    index_page = get_n_times(url)
    if index_page is None:
        # every retry failed; the original crashed inside BeautifulSoup here
        return house_ids
    soup = BeautifulSoup(index_page, 'html.parser')
    for li in soup.find_all('li', attrs={'class': 'clear LOGCLICKDATA'}):
        first = li.contents[0]
        if first.has_attr('data-housecode'):
            house_ids.append(first['data-housecode'])
        if first.has_attr('data-lj_action_housedel_id'):
            house_ids.append(first['data-lj_action_housedel_id'])
    return house_ids

In [8]:
def get_ids(base_url):
    """Collect the ids of every listing under *base_url*.

    Returns (listing_count, ids): the "共找到 N 套" counter from the first
    page and the concatenated house ids from every paginated index page
    (pg1 .. pgN).
    """
    n_house, n_page = get_total_page(base_url)
    all_ids = []
    for page_no in range(1, n_page + 1):
        # drop the trailing '/' before appending the pgN/ path segment
        page_url = base_url[:-1] + 'pg' + str(page_no) + '/'
        # extend (not append) so the result is one flat list of ids
        all_ids.extend(get_thispage_ids(page_url))
    return n_house, all_ids

In [9]:
n_house, ids = get_ids(base_url)  # crawl all index pages for the configured tag

In [11]:
n_house  # notebook display: raw counter string from the page, e.g. ' 212 '


Out[11]:
' 212 '

In [13]:
url_230='https://sh.lianjia.com/ershoufang/xidu/l2l3bp0ep230/'  # same tag, price filter 0-2.3M (bp0ep230)

In [14]:
n_house_230,ids_230 = get_ids(url_230)  # re-crawl with the price ceiling applied

In [10]:
house_id = '107000859767'  # sample listing id for the detail-page experiments below

In [18]:
house_url = 'https://sh.lianjia.com/ershoufang/107100413346.html'  # one listing's detail page

In [19]:
house_page = get_n_times(house_url)  # fetch the detail page with retries

In [20]:
soup = BeautifulSoup(house_page, 'html.parser')  # parse the detail page

In [21]:
import re  # redundant: re is already imported at the top of the file

In [22]:
divs = soup.find_all('div', attrs={'id':'record'})  # the viewing-record panel of the detail page

In [24]:
divs  # notebook display: inspect the record panel's HTML structure


Out[24]:
[<div class="record" id="record">
 <div class="list">
 <div class="title">看房记录
                   <span class="next disable"><i></i></span>
 <span class="pre disable"><i></i></span>
 </div>
 <div class="content">
 <div class="record-header">
 <div class="item mytime">带看时间</div>
 <div class="item myname">带看经纪人</div>
 <div class="item mytotal">本房总带看</div>
 <div class="phone" style="margin-left:12px;">咨询电话</div>
 </div>
 <div class="row"><span class="noData">暂无看房记录</span></div>
 </div>
 </div>
 <div class="panel">
 <div class="panel-title">近7天带看次数</div>
 <div class="count">0</div>
 <div class="totalCount">- 30日带看<span>0</span>次 -</div>
 <!-- <div class="msyy">马上预约看房</div> -->
 </div>
 </div>]

In [115]:
aa = soup.find_all(text=re.compile('近7天.*?'))  # text nodes mentioning "last 7 days" viewing count

In [102]:
aa[0].next_sibling.next_sibling.next_sibling.next_sibling  # hop siblings to reach the 30-day count div


Out[102]:
<div class="totalCount">- 30日带看<span>0</span>次 -</div>