In [4]:
import re
# line.rstrip() to strip characters at the end

hand = open('mbox-short.txt')
for line in hand:
    line = line.rstrip()
    if re.search('^From:', line) :
        print line


  File "<ipython-input-4-6ce93c063507>", line 8
    print line
             ^
SyntaxError: Missing parentheses in call to 'print'

In [5]:
import re
x = 'My 2 favorite numbers are 19 and 42'
y = re.findall('[0-9]+', x)
print(y)


['2', '19', '42']

In [6]:
import re
x = 'My 2 favorite numbers are 19 and 42'
y = re.findall('[AEIOU]+', x)
print(y)


[]

In [16]:
import re
line = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

x = re.findall('..@\S+..', line)
print(x)

y = re.findall('@\S+', line)
print(y)

z = re.findall('F.+:', line)
print(z)

r = re.findall('@(\S+)', line)
print(r)


['rd@uct.ac.za S']
['@uct.ac.za']
['From stephen.marquard@uct.ac.za Sat Jan  5 09:14:']
['uct.ac.za']

In [18]:
x = 'From: Using the : character'
y = re.findall('^F.+:', x)
print(y)


['From: Using the :']

In [19]:
import re
line = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'
x = re.findall('\S+?@\S+', line)
print(x)


['stephen.marquard@uct.ac.za']

Python Regular Expression Quick Guide

^ Matches the beginning of a line

$ Matches the end of the line

. Matches any character

\s Matches whitespace

\S Matches any non-whitespace character

  • Repeats a character zero or more times

*? Repeats a character zero or more times(non-greedy)

  • Repeats a character one or more times

+? Repeats a character one or more times(non-greedy)

[aeiou] Matches a single character in the listed set

[^XYZ] Matches a single character not in the listed set

[a-z0-9] The set of characters can include a range

( Indicates where string extraction is to start

) Indicates where string extraction is to end

Trying to solve the assignment with the sample data


In [58]:
# using the sample data to verify the code

import urllib.request
import re

sample_url = 'http://python-data.dr-chuck.net/regex_sum_42.txt'

with urllib.request.urlopen(sample_url) as response:
   html = response.read()

html_string = str(html)
numbers = re.findall('[0-9]+', html_string)

intlist = [int(x) for x in numbers]

len(intlist)
sum(intlist)


Out[58]:
445822

In [61]:
# using the actual data to calculate the answer

# import libraries
import urllib.request
import re

# name the URL
actual_url = 'http://python-data.dr-chuck.net/regex_sum_371509.txt'

# use the most basic of reading methods shown at the urllib.request project page
with urllib.request.urlopen(actual_url) as response:
   html = response.read()

# make the output into a string so that regex can be used
html_string = str(html)

# find all numbers with regex and put them into a list
numbers = re.findall('[0-9]+', html_string)

# transform the list entries to integers, one by one
intlist = [int(x) for x in numbers]

# calculate the sum of the integers in the list
#len(intlist)
sum(intlist)


Out[61]:
413003

In [62]:
# making a one-liner for fun
import urllib.request
import re

actual_url = 'http://python-data.dr-chuck.net/regex_sum_371509.txt'

with urllib.request.urlopen(actual_url) as response:
   html = response.read()

# here comes the line
sum([int(x) for x in re.findall('[0-9]+', str(html))])


Out[62]:
413003

In [ ]:


In [ ]:


In [ ]:


In [ ]: