In [167]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
In [2]:
def html_stripper(text):
    """Return ``str(text)`` with every ``<...>`` tag-like span removed.

    The argument is converted with ``str`` first, so non-string objects are
    accepted (``None`` becomes the literal text ``'None'``).
    """
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', str(text))
In [3]:
# Starting page number; the listing loop further down rebinds `page` itself.
page = 1
In [4]:
# Search URL template (deal_type=sale, offer_type=flat, district ids 13-22,
# room1..room6 enabled). The `{}` after `p=` is filled in with `.format(page)`.
district = 'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=13&district%5B1%5D=14&district%5B2%5D=15&district%5B3%5D=16&district%5B4%5D=17&district%5B5%5D=18&district%5B6%5D=19&district%5B7%5D=20&district%5B8%5D=21&district%5B9%5D=22&engine_version=2&offer_type=flat&p={}&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1'
In [6]:
# Walk search pages 1..29 and collect the numeric listing ids into `links`.
links = []
for page in range(1, 30):
    page_url = district.format(page)
    search_page = requests.get(page_url)
    search_page = search_page.content
    search_page = BeautifulSoup(search_page, 'lxml')
    # Select the <div> elements whose ng-class attribute has exactly this value
    # (presumably one per search result card -- verify against the page markup).
    flat_urls = search_page.findAll('div', attrs = {'ng-class':"{'serp-item_removed': offer.remove.state, 'serp-item_popup-opened': isPopupOpen}"})
    # Stringify the matches and cut the text around the listing URL prefix;
    # the fragments that consist only of digits are kept as listing ids.
    flat_urls = re.split('http://www.cian.ru/sale/flat/|/" ng-class="', str(flat_urls))
    for link in flat_urls:
        if link.isdigit():
            links.append(link)
In [9]:
# Download and parse the page of the first collected listing (links[0]).
flat_url = 'http://www.cian.ru/sale/flat/' + str(links[0]) + '/'
#flat_url = 'http://www.cian.ru/sale/flat/150531912/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
In [567]:
def getPrice(flat_page):
    """Extract the price of a listing as an integer.

    Parameters
    ----------
    flat_page : object with a BeautifulSoup-like ``find`` method
        Parsed listing page.

    Returns
    -------
    int
        The number formed by concatenating the last (up to four) all-digit
        tokens found in the ``object_descr_price`` div.

    Raises
    ------
    ValueError
        If the div contains no all-digit token (``int('')``).
    """
    price_div = flat_page.find('div', attrs={'class': 'object_descr_price'})
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # (a warning today, slated to become an error in a future Python).
    tokens = re.split(r'<div>|руб|\W', str(price_div))
    digit_groups = [token for token in tokens if token.isdigit()]
    # The price is printed in space-separated digit groups; only the last four
    # groups are used.
    return int("".join(digit_groups[-4:]))
In [583]:
def getAllPrices(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their prices.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order. ``links`` is the module-level list of listing ids.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getPrice(_load_page(links[idx])) for idx in range(l, r)]
In [589]:
# Prices for every collected listing (one HTTP request per link).
prices = getAllPrices(0, len(links))
In [591]:
# Re-download the first listing and prototype the coordinate extraction.
flat_url = 'http://www.cian.ru/sale/flat/' + str(links[0]) + '/'
#flat_url = 'http://www.cian.ru/sale/flat/150531912/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
# Second child of the map_info_button_extend div (presumably the element that
# carries the map URL -- confirm against the page markup).
coords = flat_page.find('div', attrs={'class':'map_info_button_extend'}).contents[1]
# Cut its text at '&', 'center=' and '%2C' so the two numbers become separate fragments.
coords = re.split('&|center=|%2C', str(coords))
In [592]:
# Inspect the fragments produced by the split.
coords
Out[592]:
In [593]:
# Keep the fragments that start with a digit; the first two are read as
# latitude and longitude.
coords_list = []
for item in coords:
    if item[0].isdigit():
        coords_list.append(item)
lat = float(coords_list[0])
lon = float(coords_list[1])
In [594]:
# Display the parsed latitude.
lat
Out[594]:
In [595]:
# Display the parsed longitude.
lon
Out[595]:
In [596]:
def getCoords_at(flat_page):
    """Extract the ``(latitude, longitude)`` pair of a listing.

    The text of the second child of the ``map_info_button_extend`` div is cut
    at ``&``, ``center=`` and ``%2C``; the first two fragments that start with
    a digit are converted to floats.

    Parameters
    ----------
    flat_page : object with a BeautifulSoup-like ``find`` method
        Parsed listing page.

    Returns
    -------
    tuple of (float, float)
        Latitude and longitude, in that order.

    Raises
    ------
    AttributeError
        If the div is missing (``find`` returned ``None``).
    IndexError
        If fewer than two digit-led fragments are present.
    """
    map_node = flat_page.find('div', attrs={'class': 'map_info_button_extend'}).contents[1]
    fragments = re.split('&|center=|%2C', str(map_node))
    # Adjacent delimiters (e.g. "&center=") make re.split emit empty strings;
    # `fragment[0]` on those used to raise IndexError, so skip them explicitly.
    coords_list = [fragment for fragment in fragments if fragment and fragment[0].isdigit()]
    lat = float(coords_list[0])
    lon = float(coords_list[1])
    return lat, lon
In [812]:
def getAllCoordinates(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their coordinates.

    Parameters
    ----------
    l, r : int
        Half-open index range into the module-level ``links`` list.

    Returns
    -------
    list of (float, float)
        One ``(latitude, longitude)`` pair per listing, in index order.
    """
    coordinates = []
    for i in range(l, r):
        flat_url = 'http://www.cian.ru/sale/flat/' + str(links[i]) + '/'
        flat_page = BeautifulSoup(requests.get(flat_url).content, 'lxml')
        # The extractor defined in this notebook is `getCoords_at`; the former
        # call to `getCoords` raised NameError on a fresh kernel.
        coordinates.append(getCoords_at(flat_page))
    return coordinates
In [599]:
# Coordinates for every collected listing (one HTTP request per link).
coordinates = getAllCoordinates(0, len(links))
In [649]:
from math import radians, cos, sin, asin, sqrt
AVG_EARTH_RADIUS = 6371


def haversine(point1, point2):
    """Great-circle distance between two points by the haversine formula.

    Each point is a ``(latitude, longitude)`` pair in degrees. The result is
    expressed in the unit of ``AVG_EARTH_RADIUS`` (6371, presumably kilometres).
    """
    # unpack latitude and longitude of both points
    (lat_a, lng_a), (lat_b, lng_b) = point1, point2
    # convert every angle to radians
    lat_a = radians(lat_a)
    lng_a = radians(lng_a)
    lat_b = radians(lat_b)
    lng_b = radians(lng_b)
    # apply the haversine formula
    delta_lat = lat_b - lat_a
    delta_lng = lng_b - lng_a
    chord = sin(delta_lat * 0.5) ** 2 + cos(lat_a) * cos(lat_b) * sin(delta_lng * 0.5) ** 2
    return 2 * AVG_EARTH_RADIUS * asin(sqrt(chord))
In [651]:
# Reference point every listing is measured against.
MSC_POINT_ZERO = (55.755831, 37.617673)
# Haversine distance from the reference point to each listing, in listing order.
distance = [haversine(MSC_POINT_ZERO, flat_coords) for flat_coords in coordinates]
In [612]:
# Download and parse the third collected listing (links[2]) as a sample page.
flat_url = 'http://www.cian.ru/sale/flat/' + str(links[2]) + '/'
#flat_url = 'http://www.cian.ru/sale/flat/150844464/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
In [613]:
# Text of the listing title div with the markup stripped.
rooms_n = flat_page.find('div', attrs={'class':'object_descr_title'})
rooms_n = html_stripper(rooms_n)
rooms_n
Out[613]:
In [614]:
# Inspect the title cut at '-' and at newlines.
re.split('-|\n', rooms_n)
Out[614]:
In [617]:
def getRoom(flat_page):
    """Return the room-count label taken from the listing title.

    The title text is cut at ``-`` and newlines. Fragments are accumulated
    until one containing ``'комн'`` is met; the accumulated text is returned
    with all whitespace removed. If a fragment containing ``'много'`` comes
    first, the label is ``'mult'``.
    """
    title = html_stripper(flat_page.find('div', attrs={'class':'object_descr_title'}))
    collected = []
    for fragment in re.split('-|\n', title):
        if 'много' in fragment:
            return 'mult'
        if 'комн' in fragment:
            break
        collected.append(fragment)
    return "".join("".join(collected).split())
In [623]:
def getAllRooms(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their room labels.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getRoom(_load_page(links[idx])) for idx in range(l, r)]
In [625]:
# Room labels for every collected listing (one HTTP request per link).
rooms = getAllRooms(0, len(links))
In [626]:
# Download and parse one fixed listing (id 150387502) as a sample page.
#flat_url = 'http://www.cian.ru/sale/flat/' + str(links[2]) + '/'
flat_url = 'http://www.cian.ru/sale/flat/150387502/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
In [629]:
# Markup of the metro div, cut at 'metro_name' and 'мин'; `metro` ends up a list.
metro = flat_page.find('div', attrs={'class':'object_descr_metro'})
metro = re.split('metro_name|мин', str(metro))
metro
Out[629]:
In [630]:
# NOTE: `metro` is already the list built in the previous cell, so this splits
# the text form of that list rather than the original markup.
re.split('metro_name|мин', str(metro))
Out[630]:
In [640]:
def getMetroDistance(flat_page):
    """Return the number of minutes to the metro stated on the listing.

    The markup of the ``object_descr_metro`` div is cut at ``'metro_name'`` and
    ``'мин'``. With more than two fragments, the last run of digits in the
    second fragment is returned as an int (0 when it holds no digit);
    otherwise the result is ``np.nan``.
    """
    metro_div = flat_page.find('div', attrs={'class':'object_descr_metro'})
    pieces = re.split('metro_name|мин', str(metro_div))
    # two fragments or fewer means the fields were not both filled in
    if len(pieces) <= 2:
        return np.nan
    # the second fragment ends with markup leftovers preceded by the minute count:
    # walk it backwards and gather the first run of digits that shows up
    trailing_digits = []
    for char in reversed(pieces[1]):
        if char.isdigit():
            trailing_digits.append(char)
        elif trailing_digits:
            break
    minutes = 0
    for power, char in enumerate(trailing_digits):
        minutes += int(char) * 10 ** power
    return minutes
In [641]:
def getAllMetroDistances(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their metro times.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getMetroDistance(_load_page(links[idx])) for idx in range(l, r)]
In [643]:
# Minutes to the metro for every collected listing (one HTTP request per link).
metro_distances = getAllMetroDistances(0, len(links))
In [644]:
def getMetroWalking(flat_page):
    """Tell whether the stated metro time is on foot (1) or by car (0).

    The markup of the ``object_descr_metro`` div is cut at ``'metro_name'`` and
    ``'мин'``; the third fragment is searched for ``'пешк'`` (-> 1) and
    ``'машин'`` (-> 0). ``np.nan`` is returned when neither word is present or
    when the split yields two fragments or fewer.
    """
    metro_div = flat_page.find('div', attrs={'class':'object_descr_metro'})
    pieces = re.split('metro_name|мин', str(metro_div))
    # two fragments or fewer means the fields were not both filled in
    if len(pieces) <= 2:
        return np.nan
    tail = pieces[2]
    if 'пешк' in tail:
        return 1
    if 'машин' in tail:
        return 0
    # Both fields were present, yet the on-foot / by-car mark is missing; no
    # such listing was seen by the author, the fallback is kept just in case.
    return np.nan
In [645]:
def getAllMetroWalking(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their on-foot flags.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getMetroWalking(_load_page(links[idx])) for idx in range(l, r)]
In [647]:
# On-foot / by-car flag for every collected listing (one HTTP request per link).
walking = getAllMetroWalking(0, len(links))
In [656]:
# Download the fixed sample listing and show the text of its
# object_descr_props table with the markup stripped.
#flat_url = 'http://www.cian.ru/sale/flat/' + str(links[2]) + '/'
flat_url = 'http://www.cian.ru/sale/flat/150387502/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
table = flat_page.find('table', attrs = {'class':'object_descr_props'})
table = html_stripper(table)
table
Out[656]:
In [658]:
# Second fragment after cutting the table text at 'Этаж' / 'Тип продажи',
# i.e. the text that follows the first of those markers.
building_block = re.split('Этаж|Тип продажи', table)[1]
building_block
Out[658]:
In [694]:
def getBrick(flat_page):
    """Classify the building material of a listing.

    Looks at the text following the first ``'Этаж'`` / ``'Тип продажи'`` marker
    of the properties table. When it mentions ``'Тип дом'``: 1 for
    ``'кирпич'`` or ``'монолит'``, 0 for ``'панельн'``, ``'деревян'``,
    ``'сталин'`` or ``'блочн'``. ``np.nan`` in every other case.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    section = re.split('Этаж|Тип продажи', props_text)[1]
    if 'Тип дом' not in section:
        return np.nan
    if any(marker in section for marker in ('кирпич', 'монолит')):
        return 1
    if any(marker in section for marker in ('панельн', 'деревян', 'сталин', 'блочн')):
        return 0
    return np.nan
In [695]:
def getAllBricks(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their material flags.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getBrick(_load_page(links[idx])) for idx in range(l, r)]
In [697]:
# Building-material flag for every collected listing (one HTTP request per link).
bricks = getAllBricks(0, len(links))
In [690]:
def getNew(flat_page):
    """Tell whether a listing is a new build (1) or resale (0).

    Looks at the text following the first ``'Этаж'`` / ``'Тип продажи'`` marker
    of the properties table. When it mentions ``'Тип дом'``: 1 for
    ``'новостр'``, 0 for ``'втор'``. ``np.nan`` in every other case.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    section = re.split('Этаж|Тип продажи', props_text)[1]
    if 'Тип дом' not in section:
        return np.nan
    if 'новостр' in section:
        return 1
    if 'втор' in section:
        return 0
    return np.nan
In [691]:
def getAllNew(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their new-build flags.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getNew(_load_page(links[idx])) for idx in range(l, r)]
In [693]:
# New-build flag for every collected listing (one HTTP request per link).
new = getAllNew(0, len(links))
In [698]:
# Download the fixed sample listing again and show the stripped text of its
# object_descr_props table (used to prototype the floor extraction).
#flat_url = 'http://www.cian.ru/sale/flat/' + str(links[2]) + '/'
flat_url = 'http://www.cian.ru/sale/flat/150387502/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
table = flat_page.find('table', attrs = {'class':'object_descr_props'})
table = html_stripper(table)
table
Out[698]:
In [699]:
# Text that follows the first 'Этаж' / 'Тип продажи' marker of the table.
building_block = re.split('Этаж|Тип продажи', table)[1]
building_block
Out[699]:
In [700]:
# Cut that text at newlines and non-breaking spaces (with or without '/').
floor_block = re.split('\xa0/\xa0|\n|\xa0', building_block)
floor_block
Out[700]:
In [701]:
def getFloor(flat_page):
    """Return the floor number of a listing.

    The text following the first ``'Этаж'`` / ``'Тип продажи'`` marker of the
    properties table is cut at newlines and non-breaking spaces; the digits of
    the third fragment form the floor number.

    Parameters
    ----------
    flat_page : object with a BeautifulSoup-like ``find`` method
        Parsed listing page.

    Returns
    -------
    int
        The floor number, or 0 when the fragment holds no digit.

    Raises
    ------
    IndexError
        If the table text lacks the expected markers / fragments.
    """
    table = flat_page.find('table', attrs={'class': 'object_descr_props'})
    table = html_stripper(table)
    building_block = re.split('Этаж|Тип продажи', table)[1]
    floor_block = re.split('\xa0/\xa0|\n|\xa0', building_block)
    # Join the digits and convert once. The previous loop weighted every digit
    # by its character position, so a stray non-digit character after the
    # number (e.g. '5 ') multiplied the value by ten.
    digits = "".join(char for char in floor_block[2] if char.isdigit())
    return int(digits) if digits else 0
In [702]:
def getAllFloors(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their floor numbers.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getFloor(_load_page(links[idx])) for idx in range(l, r)]
In [704]:
# Floor number for every collected listing (one HTTP request per link).
floors = getAllFloors(0, len(links))
In [705]:
def getNFloor(flat_page):
    """Return the number of floors in the building of a listing.

    The text following the first ``'Этаж'`` / ``'Тип продажи'`` marker of the
    properties table is cut at newlines and non-breaking spaces; the fourth
    fragment is returned as an int when it is all digits, else ``np.nan``.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    section = re.split('Этаж|Тип продажи', props_text)[1]
    fragments = re.split('\xa0/\xa0|\n|\xa0', section)
    storeys = fragments[3]
    return int(storeys) if storeys.isdigit() else np.nan
In [706]:
def getAllNFloors(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their building heights.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getNFloor(_load_page(links[idx])) for idx in range(l, r)]
In [708]:
# Cover every collected listing, like all the other feature columns do. The
# previous hard-coded `20` left this list shorter than the rest, which makes
# `pd.DataFrame` reject the column dict (all lists must share one length).
nfloors = getAllNFloors(0, len(links))
In [709]:
# Download the fixed sample listing again and show the stripped text of its
# object_descr_props table (used to prototype the area extraction).
#flat_url = 'http://www.cian.ru/sale/flat/' + str(links[2]) + '/'
flat_url = 'http://www.cian.ru/sale/flat/150387502/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
table = flat_page.find('table', attrs = {'class':'object_descr_props'})
table = html_stripper(table)
table
Out[709]:
In [710]:
# Text that follows the first 'Общая площадь' label of the table.
space_block = re.split('Общая площадь', table)[1]
space_block
Out[710]:
In [711]:
def myStrToFloat(string):
    """Convert a number written with a decimal comma (e.g. ``'45,75'``) to a float.

    Parameters
    ----------
    string : str
        Text of the number; surrounding whitespace is ignored.

    Returns
    -------
    float or int
        The parsed value. A string without any digit (empty text or a lone
        placeholder such as ``'–'``) yields ``0``, as it did before.

    Raises
    ------
    ValueError
        If the text contains digits but is not a valid number.
    """
    cleaned = string.strip().replace(',', '.')
    # Preserve the old result for empty / placeholder input.
    if not any(char.isdigit() for char in cleaned):
        return 0
    # The hand-written digit loop scaled fractional digits by 10 ** (i - 2),
    # which is only right for a single decimal place ('45,75' became 50.7).
    return float(cleaned)
In [715]:
def getTotsp(flat_page):
    """Return the total area stated in the properties table of a listing.

    Takes the text between the first ``'Общая площадь'`` label and the next
    ``'Площадь комнат'`` label, cuts it at newlines / non-breaking spaces and
    converts the third fragment: ``int`` when all digits, otherwise via
    ``myStrToFloat``.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    after_total_label = re.split('Общая площадь', props_text)[1]
    total_part = re.split('Площадь комнат', after_total_label)[0]
    raw_value = re.split('\n|\xa0', total_part)[2]
    return int(raw_value) if raw_value.isdigit() else myStrToFloat(raw_value)
In [724]:
def getAllTotsp(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their total areas.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getTotsp(_load_page(links[idx])) for idx in range(l, r)]
In [718]:
# Total area for every collected listing (one HTTP request per link).
totsp = getAllTotsp(0, len(links))
In [719]:
def getLivesp(flat_page):
    """Return the living area stated in the properties table of a listing.

    Takes the text after the ``'Жилая площадь'`` label (searched for past the
    first ``'Общая площадь'`` label), cuts it at newlines / non-breaking spaces
    and converts the third fragment: ``int`` when all digits, otherwise via
    ``myStrToFloat``.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    after_total_label = re.split('Общая площадь', props_text)[1]
    living_part = re.split('Жилая площадь', after_total_label)[1]
    raw_value = re.split('\n|\xa0', living_part)[2]
    return int(raw_value) if raw_value.isdigit() else myStrToFloat(raw_value)
In [722]:
def getAllLivesp(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their living areas.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getLivesp(_load_page(links[idx])) for idx in range(l, r)]
In [725]:
# Living area for every collected listing (one HTTP request per link).
livesp = getAllLivesp(0, len(links))
In [726]:
# Download the fixed sample listing again and show the stripped text of its
# object_descr_props table (used to prototype the optional fields).
#flat_url = 'http://www.cian.ru/sale/flat/' + str(links[2]) + '/'
flat_url = 'http://www.cian.ru/sale/flat/150387502/'
flat_page = requests.get(flat_url)
flat_page = flat_page.content
flat_page = BeautifulSoup(flat_page, 'lxml')
table = flat_page.find('table', attrs = {'class':'object_descr_props'})
table = html_stripper(table)
table
Out[726]:
In [727]:
# Text that follows the first 'Общая площадь' label of the table.
space_block = re.split('Общая площадь', table)[1]
space_block
Out[727]:
In [728]:
# Text that follows the 'Жилая площадь' label inside that fragment.
optional_block = re.split('Жилая площадь', space_block)[1]
optional_block
Out[728]:
In [729]:
def getKitsp(flat_page):
    """Return the kitchen area stated in the properties table of a listing.

    Works on the text after the ``'Жилая площадь'`` label. If it contains
    ``'Площадь кухни'``, the third newline / non-breaking-space fragment after
    that label is converted (``int`` when all digits, otherwise via
    ``myStrToFloat``). ``np.nan`` is returned when the label is absent or the
    value is the ``'–'`` placeholder.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    after_total_label = re.split('Общая площадь', props_text)[1]
    after_living_label = re.split('Жилая площадь', after_total_label)[1]
    if 'Площадь кухни' not in after_living_label:
        return np.nan
    kitchen_part = re.split('Площадь кухни', after_living_label)[1]
    raw_value = re.split('\n|\xa0', kitchen_part)[2]
    if raw_value == '–':
        return np.nan
    if raw_value.isdigit():
        return int(raw_value)
    return myStrToFloat(raw_value)
In [730]:
def getAllKitsp(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their kitchen areas.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getKitsp(_load_page(links[idx])) for idx in range(l, r)]
In [732]:
# Kitchen area for every collected listing (one HTTP request per link).
kitsp = getAllKitsp(0, len(links))
In [737]:
def getBal(flat_page):
    """Return the balcony count stated in the properties table of a listing.

    Works on the text after the ``'Жилая площадь'`` label. If it contains
    ``'Балкон'``, the line following that label is read: ``'нет'`` gives 0,
    the ``'–'`` placeholder gives ``np.nan``, anything else gives the int
    value of its first character. ``np.nan`` when the label is absent.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    after_total_label = re.split('Общая площадь', props_text)[1]
    after_living_label = re.split('Жилая площадь', after_total_label)[1]
    if 'Балкон' not in after_living_label:
        return np.nan
    balcony_part = re.split('Балкон', after_living_label)[1]
    answer = re.split('\n', balcony_part)[1]
    if answer == 'нет':
        return 0
    if answer == '–':
        return np.nan
    return int(answer[0])
In [738]:
def getAllBal(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their balcony counts.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getBal(_load_page(links[idx])) for idx in range(l, r)]
In [740]:
# Balcony count for every collected listing (one HTTP request per link).
bal = getAllBal(0, len(links))
In [741]:
def getTel(flat_page):
    """Return the telephone flag stated in the properties table of a listing.

    Works on the text after the ``'Жилая площадь'`` label. If it contains
    ``'Телефон'``, the line following that label is read: ``'да'`` gives 1,
    ``'нет'`` gives 0. ``np.nan`` for any other value or a missing label.
    """
    props_text = html_stripper(flat_page.find('table', attrs = {'class':'object_descr_props'}))
    after_total_label = re.split('Общая площадь', props_text)[1]
    after_living_label = re.split('Жилая площадь', after_total_label)[1]
    if 'Телефон' not in after_living_label:
        return np.nan
    telephone_part = re.split('Телефон', after_living_label)[1]
    answer = re.split('\n', telephone_part)[1]
    if answer == 'да':
        return 1
    if answer == 'нет':
        return 0
    return np.nan
In [742]:
def getAllTel(l, r):
    """Download the listings ``links[l]`` .. ``links[r - 1]`` and return their telephone flags.

    One HTTP request is made per index in ``range(l, r)``; the result list
    keeps that order.
    """
    def _load_page(flat_id):
        response = requests.get('http://www.cian.ru/sale/flat/' + str(flat_id) + '/')
        return BeautifulSoup(response.content, 'lxml')

    return [getTel(_load_page(links[idx])) for idx in range(l, r)]
In [744]:
# Telephone flag for every collected listing (one HTTP request per link).
tel = getAllTel(0, len(links))
In [745]:
# Running row number, one per collected listing: 0, 1, ..., len(links) - 1.
N = list(range(0, len(links)))
In [746]:
# Constant district label for every collected listing.
# NOTE: this rebinds `district`, which earlier held the search URL template.
district = ['CAD'] * len(links)
In [808]:
# Column name -> per-listing values, in the column order of the final table.
# 'Walk', 'Metrdist' and 'Floor' now read the lists this notebook actually
# builds (`walking`, `metro_distances`, `floors`); the former names `walk`,
# `metrdist` and `floor` are not defined anywhere and raised NameError on a
# fresh kernel.
data = dict([
    ('New', new),
    ('Bal', bal),
    ('Tel', tel),
    ('Walk', walking),
    ('Metrdist', metro_distances),
    ('Nfloors', nfloors),
    ('Floor', floors),
    ('Totsp', totsp),
    ('Livesp', livesp),
    ('Kitsp', kitsp),
    ('N', N),
    ('Price', prices),
    ('Rooms', rooms),
    ('Distance', distance),
    ('Brick', bricks),
    ('District', district),
])
In [809]:
# One row per listing, one column per key of `data`.
df = pd.DataFrame(data)
In [810]:
# Display the frame transposed (features as rows, listings as columns).
df.T
Out[810]:
In [811]:
# Save the table to cian.csv without the index column.
df.to_csv('cian.csv', index=False)