In [4]:
import re

# Print every line of the mailbox file that begins with "From:".
# The context manager closes the file as soon as the loop is done.
with open('mbox-short.txt') as hand:
    for line in hand:
        # rstrip() removes trailing whitespace (including the newline), so
        # print() does not emit an extra blank line after each match.
        line = line.rstrip()
        if re.search('^From:', line):
            print(line)
In [5]:
import re

# Collect every run of consecutive digits found in the sentence.
x = 'My 2 favorite numbers are 19 and 42'
y = [match.group() for match in re.finditer('[0-9]+', x)]
print(y)
In [6]:
import re

# The sentence contains no uppercase vowels, so the result is an empty list.
x = 'My 2 favorite numbers are 19 and 42'
y = re.compile('[AEIOU]+').findall(x)
print(y)
In [16]:
import re

line = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# Raw strings keep the backslash in \S for the regex engine; in a normal
# string literal "\S" is an invalid escape sequence and triggers a warning.

# Two characters before the "@", the non-whitespace run after it, then two more characters.
x = re.findall(r'..@\S+..', line)
print(x)

# The "@" together with the non-whitespace run that follows it.
y = re.findall(r'@\S+', line)
print(y)

# Greedy match: from the first "F" up to the LAST colon on the line.
z = re.findall('F.+:', line)
print(z)

# The parentheses restrict what is returned to the part after the "@".
r = re.findall(r'@(\S+)', line)
print(r)
In [18]:
# Anchored at the start of the string; the greedy ".+" extends to the last colon.
x = 'From: Using the : character'
y = re.compile('^F.+:').findall(x)
print(y)
In [19]:
import re

line = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# Raw string so that \S reaches the regex engine untouched (in a normal string
# literal it is an invalid escape sequence). "\S+?" is the non-greedy form of "\S+".
x = re.findall(r'\S+?@\S+', line)
print(x)
^ Matches the beginning of a line
$ Matches the end of the line
. Matches any character
\s Matches whitespace
\S Matches any non-whitespace character
*? Repeats the preceding character zero or more times (non-greedy)
+? Repeats the preceding character one or more times (non-greedy)
[aeiou] Matches a single character in the listed set
[^XYZ] Matches a single character not in the listed set
[a-z0-9] The set of characters can include a range
( Indicates where string extraction is to start
) Indicates where string extraction is to end
In [58]:
# Using the sample data to verify the code.
import urllib.request
import re

sample_url = 'http://python-data.dr-chuck.net/regex_sum_42.txt'

# Download the raw bytes of the sample file.
with urllib.request.urlopen(sample_url) as response:
    html = response.read()

# Decode the bytes into text. str(html) would instead produce the bytes repr
# ("b'...'"), where non-ASCII bytes show up as \xNN escapes whose digits the
# regex below would wrongly pick up as numbers.
html_string = html.decode('utf-8', errors='replace')

# Each run of digits is one number; convert them and add them up.
numbers = re.findall('[0-9]+', html_string)
intlist = [int(x) for x in numbers]

# Last expression of the cell, so the notebook displays the total.
sum(intlist)
Out[58]:
In [61]:
# Using the actual data to calculate the answer.
# Import libraries.
import urllib.request
import re

# Name the URL.
actual_url = 'http://python-data.dr-chuck.net/regex_sum_371509.txt'

# Use the most basic of the reading methods shown in the urllib.request documentation.
with urllib.request.urlopen(actual_url) as response:
    html = response.read()

# Decode the bytes into text so that regex can be used on it. str(html) would
# instead produce the bytes repr ("b'...'"), where non-ASCII bytes show up as
# \xNN escapes whose digits would wrongly be matched as numbers.
html_string = html.decode('utf-8', errors='replace')

# Find all numbers with regex and put them into a list.
numbers = re.findall('[0-9]+', html_string)

# Transform the list entries to integers, one by one.
intlist = [int(x) for x in numbers]

# Calculate the sum of the integers in the list (displayed as the cell output).
sum(intlist)
Out[61]:
In [62]:
# Making a one-liner for fun.
import urllib.request
import re

actual_url = 'http://python-data.dr-chuck.net/regex_sum_371509.txt'
with urllib.request.urlopen(actual_url) as response:
    html = response.read()

# Here comes the line. The bytes are decoded rather than passed through str(),
# because str(html) yields the "b'...'" repr, whose \xNN escapes contain digits
# that would be counted as numbers.
sum(int(x) for x in re.findall('[0-9]+', html.decode('utf-8', errors='replace')))
Out[62]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: