Use Lianjia's viewing records for the last 7 days and the last 30 days to gauge how hot the overall market is, and how hot a particular housing type is.
In [1]:
import requests,time,os,csv,re,logging
from bs4 import BeautifulSoup
Fetch the HTML
In [2]:
def get_html(url):
    try:
        r = requests.get(url, timeout=16)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        now_time = time.strftime('%Y%m%d%H%M%S')
        logging.info(now_time + ' get_html exception, url: ' + url)
Try fetching up to n times
In [3]:
def get_n_times(page_url, n=5):
    # Retry up to n times when the fetch fails or the server returns an error page
    i = 1
    sleep_time = 0.1
    index_page = get_html(page_url)
    while index_page is None or "小差" in index_page:  # "小差" appears on Lianjia's error page
        if i > n:
            break
        else:
            time.sleep(sleep_time)
            index_page = get_html(page_url)
            sleep_time += 0.1
            i += 1
    return index_page
Get the total number of pages under a given tag
In [4]:
def get_total_page(tag_url):
    page_url = tag_url + 'd1'
    index_page = get_n_times(page_url)
    if index_page is None or "小差" in index_page:
        taged_total_num = None  # page never loaded, so the listing count is unknown
        total_page = 101
    else:
        soup = BeautifulSoup(index_page, 'html.parser')
        taged_total_num = soup.find('div', attrs={'class': 'resultDes clear'}).span.string  # total number of listings found
        # total_page = int(taged_total_num) // 30 + 1  # at most 30 listings per page
        # if total_page > 100:
        #     total_page = 100  # only the first 3000 listings are shown when the total exceeds 3000
        page_data = soup.find('div', attrs={'class': 'page-box house-lst-page-box'})['page-data']
        total_page = eval(page_data)['totalPage']  # page-data holds a small dict-like string
    return taged_total_num, total_page
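The page-data attribute holds a small JSON-style dict string, so json.loads is a safer way to read it than eval. A minimal sketch, assuming the attribute value looks like the example below (the exact keys are an assumption):

import json

page_data = '{"totalPage":35,"curPage":1}'  # assumed example of the page-data attribute value
total_page = json.loads(page_data)['totalPage']  # parses the string without executing it
print(total_page)  # 35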
Parse an index page and write the rows to CSV
In [49]:
def parse_index_page(date, index_page, tag, page_num, total_page):
    path = os.path.join(os.getcwd(), 'results', '%s_index_info_all.csv' % date)  # tag examples: l2 = two-bedroom, p21 = under 2 million
    fieldnames = ['date', 'tag', 'taged_total_num', 'total_page', 'page_num',
                  'key', 'title', 'total_price', 'total_price_unit', 'per_price', 'per_price_unit', 'prop1']
    if not os.path.exists(path):  # if the file does not exist yet, create it and write the header
        csv_file = open(path, 'w', newline='')
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        csv_file.close()
    csv_file = open(path, 'a+', newline='')
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    item = {}
    soup = BeautifulSoup(index_page, 'html.parser')
    # get basic info
    taged_total_num = soup.find('div', attrs={'class': 'resultDes clear'}).span.string
    # update basic info
    item.update({'date': date, 'tag': tag, 'taged_total_num': taged_total_num,
                 'total_page': total_page, 'page_num': page_num})
    index_info = soup.find_all('a', attrs={'class': 'text link-hover-green js_triggerGray js_fanglist_title'})
    for a in index_info:
        key = a['key']
        title = a['title']
        item.update({'key': key, 'title': title})
        price_div = a.parent.next_sibling.next_sibling
        total_price = price_div.find('span', attrs={'class': 'total-price strong-num'}).string
        total_price_unit = price_div.find('span', attrs={'class': 'unit'}).string
        per_price_info = price_div.find('span', attrs={'class': 'info-col price-item minor'}).string
        per_price_info = re.search(r'(\d+)(\S{3})', per_price_info).groups()
        per_price = per_price_info[0]
        per_price_unit = per_price_info[1]
        prop_div = price_div.next_sibling.next_sibling
        props = prop_div.find_all('span', attrs={'class': 'c-prop-tag2'})
        if len(props) > 0 and "距离" in props[0].string:  # keep the distance-to-metro tag when present
            prop1 = props[0].string
        else:
            prop1 = ""
        # prop2 = props[1].string
        # prop3 = props[2].string
        item.update({'total_price': total_price, 'total_price_unit': total_price_unit, 'per_price': per_price,
                     'per_price_unit': per_price_unit, 'prop1': prop1})
        writer.writerow(item)  # originally wrapped in a writer2csv helper, but each call kept rewriting the last item; writing with writer directly works fine
    csv_file.close()
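parse_index_page is never called in this notebook section, so purely as a usage sketch: a hypothetical driver that walks every result page of a filtered URL and lets parse_index_page append each page's rows. The function name crawl_tag and the tag label are assumptions, not part of the original code.

def crawl_tag(tag_url, tag_label):
    # Hypothetical driver (not in the original notebook): iterate every result page
    # under tag_url and let parse_index_page append its rows to the CSV.
    date = time.strftime('%Y%m%d')
    os.makedirs(os.path.join(os.getcwd(), 'results'), exist_ok=True)  # parse_index_page writes into results/
    taged_total_num, total_page = get_total_page(tag_url)
    for page_num in range(1, total_page + 1):
        page_url = tag_url[:-1] + 'pg' + str(page_num) + '/'
        index_page = get_n_times(page_url)
        if index_page is None or "小差" in index_page:
            continue  # still failing after retries, skip this page
        parse_index_page(date, index_page, tag_label, page_num, total_page)

# Example call with the filter URL used later in this notebook:
# crawl_tag('https://sh.lianjia.com/ershoufang/xidu/l2l3/', 'l2l3')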
Links:
Given the base_url with the filter conditions appended, output the total number of listings and their ids
In [6]:
base_url= 'https://sh.lianjia.com/ershoufang/xidu/l2l3/'
date=time.strftime('%Y%m%d')
# log_dir=os.path.join(os.getcwd(),'results/get_index.log')
# logging.basicConfig(filename=log_dir,level=logging.INFO)
In [7]:
def get_thispage_ids(url):
    house_ids = []  # reset to a fresh empty list on every call, otherwise ids from the previous run would accumulate
    index_page = get_n_times(url)
    soup = BeautifulSoup(index_page, 'html.parser')
    lis = soup.find_all('li', attrs={'class': 'clear LOGCLICKDATA'})
    for li in lis:
        if li.contents[0].has_attr('data-housecode'):
            house_code = li.contents[0]['data-housecode']
            house_ids.append(house_code)
        if li.contents[0].has_attr('data-lj_action_housedel_id'):
            house_code = li.contents[0]['data-lj_action_housedel_id']
            house_ids.append(house_code)
    return house_ids
In [8]:
def get_ids(base_url):
    ids = []
    n_house, n_page = get_total_page(base_url)
    urls = [base_url[:-1] + 'pg' + str(i) + '/' for i in range(1, n_page + 1)]
    for url in urls:
        thispage_ids = get_thispage_ids(url)
        ids += thispage_ids
        # ids.append(thispage_ids)  # append would nest each page's list instead of extending the flat list of ids
    return n_house, ids
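A quick illustration of the pitfall noted in the commented-out line: append nests each page's list inside the result, while += keeps it flat (the id values here are made up):

page1_ids = ['1001', '1002']
page2_ids = ['1003']

flat = []
flat += page1_ids   # ['1001', '1002']
flat += page2_ids   # ['1001', '1002', '1003']

nested = []
nested.append(page1_ids)  # [['1001', '1002']]
nested.append(page2_ids)  # [['1001', '1002'], ['1003']]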
In [9]:
n_house, ids = get_ids(base_url)
In [11]:
n_house
Out[11]:
In [13]:
url_230='https://sh.lianjia.com/ershoufang/xidu/l2l3bp0ep230/'
In [14]:
n_house_230,ids_230 = get_ids(url_230)
In [10]:
house_id = '107000859767'
In [18]:
house_url = 'https://sh.lianjia.com/ershoufang/107100413346.html'
In [19]:
house_page = get_n_times(house_url)
In [20]:
soup = BeautifulSoup(house_page, 'html.parser')
In [21]:
import re
In [22]:
divs = soup.find_all('div', attrs={'id':'record'})  # look for the viewing-record block on the detail page
In [24]:
divs
Out[24]:
In [115]:
aa = soup.find_all(text=re.compile('近7天.*?'))  # text nodes containing "近7天" (last 7 days)
In [102]:
aa[0].next_sibling.next_sibling.next_sibling.next_sibling  # walk forward through sibling nodes to inspect what sits next to the label
Out[102]:
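Wrapping up the exploration above, a hedged sketch of pulling the last-7-day and last-30-day viewing counts out of a detail page. It assumes the counts appear shortly after the '近7天' and '近30天' labels in the HTML; if the markup differs, the regexes need adjusting.

def get_view_counts(house_url):
    # Sketch only: the label wording and the assumption that the count follows
    # each label within a short span of markup are not confirmed by this notebook.
    page = get_n_times(house_url)
    if page is None:
        return None
    m7 = re.search(r'近7天[^\d]{0,30}(\d+)', page)
    m30 = re.search(r'近30天[^\d]{0,30}(\d+)', page)
    return {
        'last_7_days': int(m7.group(1)) if m7 else None,
        'last_30_days': int(m30.group(1)) if m30 else None,
    }

# Example with the detail page fetched above:
# get_view_counts('https://sh.lianjia.com/ershoufang/107100413346.html')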