In [118]:
import sys, requests, os, re, datetime
from bs4 import BeautifulSoup
# This is where the files are hosted
url= "http://www.leboncoin.fr/informatique/offres/ile_de_france/?ps=4&pe=8&q=thinkpad&th=1"
#inner_urls = []
#this'll be an array of dictionaries
Designed for computers, but it could be used for anything with the right link. Inside the link parameters:
In [6]:
def scrape_listings(url):
    """Scrape leboncoin listing cards from a search-results page.

    Parameters
    ----------
    url : str
        Full search URL (e.g. the ThinkPad query defined above).

    Returns
    -------
    list of dict
        One dict per listing with keys:
        'url', 'titre', 'prix', 'location', 'image', 'date', 'category'.
    """
    # Fallback thumbnail used when a listing card has no picture.
    no_image_url = ("https://upload.wikimedia.org/wikipedia/commons/thumb/a/ac/"
                    "No_image_available.svg/200px-No_image_available.svg.png")

    def _clean(node_text):
        # Collapse newlines and trim surrounding whitespace.
        return str(node_text).replace('\n', '').strip()

    computers = []
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Each unsifted listing card is a <div class="lbc"> wrapped in its <a> link.
    for ordinateur in soup.find_all("div", {"class": "lbc"}):
        # Use a distinct name so we don't shadow the `url` parameter.
        listing_url = ordinateur.find_parent('a')['href']
        date = ordinateur.find("div", {'class': 'date'}).text.replace('\n', " ").strip()
        details = ordinateur.find('div', {'class': 'detail'})
        title = _clean(details.find('h2', {'class': 'title'}).string)
        category = _clean(details.find('div', {'class': 'category'}).string)
        placement = _clean(details.find('div', {'class': 'placement'}).string).replace(' / ', " ")
        price = _clean(details.find('div', {'class': 'price'}).string)
        # Images are nested one level deeper and may be missing entirely;
        # only catch the failures a missing element can produce, instead of
        # a bare `except:` that would also hide real bugs.
        try:
            image_div = ordinateur.find('div', {'class': 'image'})
            image_nb = image_div.find('div', {'class': 'image-and-nb'})
            image_url = image_nb.find('img')['src']
        except (AttributeError, TypeError, KeyError):
            image_url = no_image_url  # no-image placeholder
        # Keys are a French/English mix, kept as-is for compatibility with
        # the rest of the notebook.
        computers.append({'url': listing_url, 'titre': title, 'prix': price,
                          'location': placement, 'image': image_url,
                          'date': date, 'category': category})
    return computers
def get_recent(computers):  # get the computers listed TODAY
    """Filter `computers` down to the ones listed today ("Aujourd'hui").

    Parameters
    ----------
    computers : list of dict
        Listing dicts as produced by scrape_listings(); each must have a
        'date' value. Matching dicts are mutated in place ('date' rewritten).

    Returns
    -------
    list of tuple
        One tuple of sorted (key, value) pairs per matching listing, with
        "Aujourd'hui" replaced by today's "day-month" string.
    """
    # Compute today's "day-month" label once, outside the loop.
    now = datetime.datetime.now()
    today = str(now.day) + '-' + str(now.month)
    recent_computers = []
    for computer in computers:
        if computer['date'].startswith('Aujourd\'hui'):
            computer['date'] = computer['date'].replace('Aujourd\'hui', today)
            # BUG FIX: the original `tuple(sorted(computer))` sorted only the
            # dict KEYS and dropped every value; the rest of the notebook
            # builds tuples from sorted(d.items()), so do the same here.
            recent_computers.append(tuple(sorted(computer.items())))
    return recent_computers
In [66]:
# Fetch every listing card for the search URL, then show how many we got.
computers = scrape_listings(url)
len(computers)
Out[66]:
In [16]:
# Inspect the first five listings and build one hashable tuple per listing
# (sorted (key, value) pairs) so they can later be stored in sets.
# This replaces five copy-pasted cells; the fifth copy had a bug: it stored
# listing 4 into comp_set_3, clobbering listing 3, instead of comp_set_4.
comp_sets = []
for computer in computers[:5]:
    print(sorted(computer.items()))
    comp_sets.append(tuple(sorted(computer.items())))
comp_set_0, comp_set_1, comp_set_2, comp_set_3, comp_set_4 = comp_sets
In [68]:
# One hashable tuple of sorted (key, value) pairs per listing, so the
# listings can be stored in sets and compared between scrapes.
l_tuples = [tuple(sorted(listing.items())) for listing in computers]
In [70]:
# Peek at the first listing tuple.
l_tuples[0]
Out[70]:
In [104]:
In [112]:
# Simulate one "refresh": b_set is the running total of listings already
# seen, a_set is the latest scrape, and c_set is whatever is new this time.
b_set = set()
for tup in l_tuples[2:8]:   # pretend these six were seen earlier
    b_set.add(tup)
a_set = set()
for tup in l_tuples[:4]:    # latest entries (overlaps b_set on items 2 and 3)
    a_set.add(tup)
c_set = a_set - b_set       # take the items unique to the latest scrape
# c_set is what we want to print/report.
# BUG FIX: the original `b_set = a_set` REPLACED the running total with only
# the latest scrape, forgetting every previously-seen listing; the comment
# said "must add it to the total set", so union the new entries in instead.
b_set = b_set | a_set
print(len(list(c_set)))
print(len(list(b_set)))
In [107]:
Out[107]:
In [121]:
# Pull the first two "new listing" tuples out of c_set for inspection.
zero = list(c_set)[0]
one = list(c_set)[1]
len(c_set)
# NOTE(review): `l_s` is never defined anywhere in this notebook — this cell
# only ran against stale kernel state and will NameError on a fresh
# Restart & Run All. Presumably a since-deleted variable; define or remove it.
type(l_s)
#dict(l_s)
l_s
Out[121]:
In [110]:
# Rebuild a dict from the first "new listing" tuple of (key, value) pairs.
dict(zero)
Out[110]:
In [40]:
# Show the latest-scrape set.
a_set
Out[40]:
In [42]:
# BUG FIX: sets are unordered and do not support indexing — `a_set[1]`
# raised TypeError. Materialize to a list first (element order is arbitrary).
list(a_set)[1]
In [37]:
# Confirm a_set really is a set.
type(a_set)
Out[37]:
In [27]:
# Print the listings that also appear in new_array.
# NOTE(review): `new_array` is never defined in this notebook — this cell
# relied on stale kernel state and NameErrors on a fresh run.
for item in computers:
    if item in new_array:
        print(item)
In [29]:
# Print every listing. The original guard `if item in computers` inside a
# loop over `computers` was a tautology (the list always contains its own
# elements — likely `new_array` was intended), so it is dropped; behavior
# is unchanged.
for item in computers:
    print(item)
In [30]:
# Remove from new_array every listing that also appears in computers
# (list.remove drops only the first matching occurrence).
# NOTE(review): `new_array` is not defined on a fresh kernel — stale state.
for item in computers:
    if item in new_array:
        new_array.remove(item)
In [31]:
# Show what is left in new_array after the removal above.
new_array
Out[31]:
In [ ]: