In [1]:
import socket

# Open an IPv4 TCP (stream) socket and connect it to the web server on port 80.
server_address = ('www.vahidmirjalili.com', 80)
mysock = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM)
mysock.connect(server_address)

# Bare trailing expression: the notebook shows the socket's repr as the cell output.
mysock


Out[1]:
<socket.socket fd=47, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.6', 42808), raddr=('192.64.112.227', 80)>

In [ ]:
# Send an HTTP/1.0 GET request over the already-connected socket `mysock`
# and print the raw response in chunks of up to 512 bytes.
#
# Sockets transmit bytes: passing a str to send()/sendall() raises TypeError
# in Python 3, so the request line is encoded first.
# NOTE(review): the URL below ('http://www/py4inf.com/...') looks like a typo
# and names a different host than the one the socket is connected to -- confirm
# the intended target before relying on the response.
request = 'GET http://www/py4inf.com/code/romo.txt HTTP/1.0\n\n'.encode()

try:
    # sendall() keeps sending until the whole request is written; plain send()
    # may transmit only part of the buffer.
    mysock.sendall(request)

    while True:
        data = mysock.recv(512)
        # An empty bytes object means the peer closed the connection.
        if len(data) < 1:
            break
        print(data)
finally:
    # Release the socket even if sending or receiving fails.
    mysock.close()

In [2]:
import urllib.request
import re

# Print every line of the page that contains an href attribute followed by a
# non-whitespace character.
#
# The pattern is a raw string so that \S reaches the regex engine untouched;
# in a normal string literal '\S' is an invalid escape sequence, which Python
# reports with a DeprecationWarning (SyntaxWarning in newer versions).
href_pattern = re.compile(r'href=(\S)')

# The context manager closes the HTTP response even if decoding or printing
# raises part-way through the loop.
with urllib.request.urlopen('http://vahidmirjalili.com/about.html') as fhand:
    for line in fhand:
        # Lines arrive as bytes; decode and drop the trailing newline/whitespace.
        line = line.decode().rstrip()
        if href_pattern.search(line):
            print(line)


    <strong><link rel="stylesheet" href="/static/css/main.css"></strong>
              <li><a href="/" class="item">Home</a></li>
              <li><a href="/tutorials.html" class="item">Tutorials</a></li>
              <li><a href="/ipynb.html" class="item">IPython notebooks</a></li>
              <li><a href="/about.html" class="item">About </a></li>
    <p >Vahid earned his Ph.D. in Mechanical Engineering, joint with Biochemistry & Molecular Biology from <a href='http://msu.edu'>Michigan State University.</a>
      <a href="mailto:vmirjalily@gmail.com?Subject=Hello%20again" target="_top"><img float:"left" src="/static/images/email-blue-icon.png" width="80" height="80" align="middle"></a>
      <a href="http://www.linkedin.com/pub/vahid-mirjalili/21/920/b49/"><img float:"left" src="/static/images/linkedin-logo.png" width="100" height="50" align="middle"></a>
      <a href="http://www.github.com/mirjalil"><img float:"left" src="/static/images/github-logo.jpg" width="80" height="70"></a>
      <a href="http://www.twitter.com/vmirly"><img float:"left" src="/static/images/twitter-logo.png" width="80" height="70"></a>
      <a href="http://scholar.google.com/citations?hl=en&user=8YTgJ_cAAAAJ"><img float:"left" src="/static/images/google_scholar.jpg" width="110" height="55"></a>
      <a href="https://plot.ly/~vmirjalily"><img float:"left" src="/static/images/plotly_logo.png" width="90" height="80"></a>

In [ ]:
import bs4

In [3]:
import urllib.request
import bs4

# Download the page and parse it with the lxml-backed BeautifulSoup parser.
page_url = 'http://vahidmirjalili.com/about.html'
html = urllib.request.urlopen(page_url).read()
soup = bs4.BeautifulSoup(markup=html, features='lxml')

# Collect every <a> element and print its href (None when the attribute is absent).
tags = soup.find_all('a')
for tag in tags:
    print(tag.get('href'))


/
/tutorials.html
/ipynb.html
/about.html
http://msu.edu
mailto:vmirjalily@gmail.com?Subject=Hello%20again
http://www.linkedin.com/pub/vahid-mirjalili/21/920/b49/
http://www.github.com/mirjalil
http://www.twitter.com/vmirly
http://scholar.google.com/citations?hl=en&user=8YTgJ_cAAAAJ
https://plot.ly/~vmirjalily

In [4]:
# List the first 7 links on the search page whose href ends in "html".
# `reslink` is left holding the last href examined, which a later cell reads.
link = 'http://detroit.craigslist.org/search/apa'

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(link) as response:
    html = response.read()

soup = bs4.BeautifulSoup(html, 'lxml')

tags = soup('a')

n = 0
for tag in tags:
    reslink = tag.get('href', None)
    # Anchors without an href yield None; re.search() would raise TypeError on
    # it, so such tags are skipped instead of aborting the loop.
    if reslink is not None and re.search('html$', reslink):
        print(reslink)
        n += 1
    # Stop once 7 matching links have been printed.
    if n > 6:
        break


/okl/apa/5386682880.html
/okl/apa/5386682880.html
/wyn/apa/5348079924.html
/wyn/apa/5348079924.html
/wyn/apa/5349971461.html
/wyn/apa/5349971461.html
/mcb/apa/5386675272.html

In [5]:
# Build the absolute URL of the listing from the site root and the relative
# path left in `reslink` by the previous cell, then show it.
mainlink = 'http://detroit.craigslist.org'
link = ''.join([mainlink, reslink])
print(link)

# Fetch the listing page and parse it with the lxml-backed parser.
html = urllib.request.urlopen(link).read()
soup = bs4.BeautifulSoup(markup=html, features='lxml')

# Print the src of every <img> element (None when the attribute is absent).
tags = soup.find_all('img')
for tag in tags:
    print(tag.get('src'))


http://detroit.craigslist.org/mcb/apa/5386675272.html
http://images.craigslist.org/01616_i4XlwKNsVAG_600x450.jpg
http://images.craigslist.org/01616_i4XlwKNsVAG_50x50c.jpg
http://images.craigslist.org/00505_5w3jYQ540FM_50x50c.jpg
http://images.craigslist.org/00n0n_jRBKqTIguMZ_50x50c.jpg
http://images.craigslist.org/00202_1NYe6rv5U6k_50x50c.jpg
http://images.craigslist.org/00o0o_8YyJ72SfBaX_50x50c.jpg
http://images.craigslist.org/00U0U_g5eRTudtnED_50x50c.jpg

Example 1: Get the sum of all numbers


In [16]:
import urllib.request
import bs4

# Download the comments page and parse it with Python's built-in HTML parser.
url = 'http://python-data.dr-chuck.net/comments_219720.html'
html = urllib.request.urlopen(url).read()
soup = bs4.BeautifulSoup(markup=html, features="html.parser")

# Retrieve all of the <span> tags; the text of each one is converted to an
# integer and the values are added up.
tags = soup.find_all('span')
x = sum(int(tag.text) for tag in tags)

print(x)


2649

In [24]:
import urllib.request
import bs4
import ssl

# Starting from the page below, repeatedly follow the anchor at a fixed
# position on each page, printing every URL that is visited next.
url = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Huda.html'

# create_default_context() negotiates the best TLS version both sides support
# and verifies the server certificate and hostname. The previous
# ssl.SSLContext(ssl.PROTOCOL_TLSv1) pinned the deprecated TLS 1.0 protocol and
# did not verify certificates by default.
scontext = ssl.create_default_context()

HOPS = 7             # number of links to follow
LINK_POSITION = 17   # zero-based index of the anchor to follow on each page

for i in range(HOPS):
    # The context manager closes each HTTP response once its body has been read.
    with urllib.request.urlopen(url, context=scontext) as response:
        html = response.read()
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Retrieve all of the anchor tags
    tags = soup('a')
    # The chosen anchor's href becomes the page fetched on the next iteration.
    url = tags[LINK_POSITION].get('href', None)
    print(url)


https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Annsarai.html
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Hillary.html
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Reean.html
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Trudy.html
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Penelope.html
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Tammam.html
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Farisya.html

In [ ]: