In [1]:
# code for loading the format for the notebook
import os
# path : store the current path to convert back to it later
# (the following cell calls os.chdir(path) to restore it)
path = os.getcwd()
# switch into the notebook_format directory two levels up before importing;
# presumably `formats` lives there and is found via the working
# directory -- TODO confirm
os.chdir(os.path.join('..', '..', 'notebook_format'))
from formats import load_style
# NOTE(review): plot_style=False presumably leaves the plotting style
# untouched; verify against formats.load_style
load_style(plot_style=False)
Out[1]:
In [2]:
# go back to the working directory that the first cell saved in `path`
os.chdir(path)
# magic to print version
# NOTE(review): per the watermark docs, -a sets the author, -d and -t add
# the date and time, and -v adds the Python/IPython versions -- confirm
# against the installed watermark release
%load_ext watermark
%watermark -a 'Ethen' -d -t -v
Some of the material here is a condensed reimplementation of the resource: Python3 Cookbook Chapter 2. Strings and Text, which was originally freely available online.
In [3]:
import re

line = 'asdf fjdk; afed, fjek,asdf, foo'
# a delimiter is one semicolon, comma or whitespace character,
# followed by any amount of extra whitespace
delimiters = re.compile(r'[;,\s]\s*')
delimiters.split(line)
Out[3]:
In [4]:
filenames = ['Makefile', 'foo.c', 'bar.py', 'spam.c', 'spam.h']
# several suffixes can be tested at once, but only when they are passed
# as a tuple; a list is rejected by str.endswith
c_suffixes = ('.c', '.h')
print(list(filter(lambda fname: fname.endswith(c_suffixes), filenames)))
# is there at least one Python source file?
print(any(map(lambda fname: fname.endswith('.py'), filenames)))
In [5]:
from fnmatch import fnmatchcase

addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK st',
    '4802 N BROADWAY',
]
# fnmatchcase compares case-sensitively, so the entry ending in
# lower-case 'st' is not selected
list(filter(lambda street: fnmatchcase(street, '* ST'), addresses))
Out[5]:
In [6]:
text = 'yeah, but no, but yeah, but no, but yeah'
# position of the first occurrence of the substring
needle = 'no'
text.find(needle)
Out[6]:
Example2: When matching the same complex pattern many times, it's better to precompile the regular expression pattern first using `re.compile()`.
In [7]:
import re

text1 = '11/27/2012'
text2 = 'Nov 27, 2012'


def print_match(matched):
    """Print 'yes' when `matched` is a match object and 'no' when it is None.

    Replaces the if/else block that was previously repeated for every
    match attempt in this cell.
    """
    print('yes' if matched else 'no')


# Simple matching: \d+ means match one or more digits
# the 'r' prefix makes this a raw string, which leaves the backslash (\)
# uninterpreted; otherwise you would have to write \\ for each backslash
print_match(re.match(r'\d+/\d+/\d+', text1))
print_match(re.match(r'\d+/\d+/\d+', text2))

# the re.compile version: compile the pattern once, then reuse its methods
datepat = re.compile(r'\d+/\d+/\d+')
print_match(datepat.match(text1))
print_match(datepat.match(text2))
Example3: Find all occurrences in the text, instead of just the first one, with `findall()`.
In [8]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
# reuse the pattern compiled in the previous example; it has no capture
# groups, so every complete match comes back as a plain string
all_dates = datepat.findall(text)
all_dates
Out[8]:
Example4: Capture groups by enclosing parts of the pattern in parentheses.
In [9]:
# single match: every parenthesised part of the pattern becomes a numbered group
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
all_groups = m.groups()
first_group = m.group(1)
print(all_groups, first_group, sep='\n')
In [10]:
# multiple matches: with capture groups, findall returns one tuple per match
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
date_tuples = datepat.findall(text)
print(date_tuples)
print(re.findall(r'(\d+)/(\d+)/(\d+)', text)) # module-level call, for matching just once
for month, day, year in date_tuples:
    print('{}-{}-{}'.format(year, month, day))
In [11]:
# finditer hands back an iterator of match objects instead of a list
matches = datepat.finditer(text)
for m in matches:
    print(m.groups())
In [12]:
text = 'yeah, but no, but yeah, but no, but yeah'
# plain literal replacement: every occurrence of `old` becomes `new`
old, new = 'yeah', 'yep'
text.replace(old, new)
Out[12]:
Example2: More complex replacement using `re.sub()`. The backslashed digits (e.g. `\1`) refer to the matched groups.
In [13]:
import re

# rewrite dates from m/d/Y to Y-m-d; \1, \2 and \3 in the replacement
# stand for the first, second and third captured group
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
date_pattern = r'(\d+)/(\d+)/(\d+)'
iso_template = r'\3-\1-\2'
re.sub(date_pattern, iso_template, text)
Out[13]:
Example3: Define a function for the substitution.
In [14]:
import re
from calendar import month_abbr
def change_date(m):
    """Build the replacement text for one matched m/d/Y date.

    Parameters
    ----------
    m : re.Match
        Match whose groups 1, 2 and 3 hold the month, day and year digits.

    Returns
    -------
    str
        The date as 'day Mon year', e.g. '27 Nov 2012'. If the month
        number is not in 1-12 the matched text is returned unchanged, so
        one malformed date cannot abort a whole substitution.
    """
    month = int(m.group(1))
    # month_abbr[0] is an empty placeholder and indexes above 12 raise
    # IndexError, so anything outside 1-12 cannot be turned into a name
    if not 1 <= month <= 12:
        return m.group(0)
    mon_name = month_abbr[month]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))
# compile the grouped date pattern, then let change_date produce the
# replacement text for each match found in `text`
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
reformatted = datepat.sub(change_date, text)
reformatted
Out[14]:
Example4: Use `.subn()` to replace and return the number of substitutions made.
In [15]:
# subn gives back both the new string and how many replacements were made
newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext, n, sep='\n')
Example5: Supply the `re.IGNORECASE` flag if you want to ignore case.
In [16]:
text = 'UPPER PYTHON, lower python, Mixed Python'
# re.IGNORECASE lets the lower-case literal match any capitalisation
python_pat = re.compile('python', flags = re.IGNORECASE)
python_pat.findall(text)
Out[16]:
In [17]:
# strip() without arguments trims whitespace (the newline included) from both ends
s = ' hello world \n'
# strip(chars) trims any of the listed characters from both ends
t = '-----hello world====='
print(s.strip(), t.strip('-='), sep='\n')
In [18]:
"""
with open(filename) as f:
lines = (line.strip() for line in f)
for line in lines:
"""
# takeaway for the sketch in the string literal above
note = 'Generator Expression can be useful when you want to perform other operations after stripping'
print(note)
In [19]:
intab = 'aeiou'
outtab = '12345'
# build a table that maps each character of intab to the character at the
# same position in outtab: a > 1, e > 2, i > 3, o > 4, u > 5
trantab = str.maketrans(intab, outtab)
# the sample text is deliberately NOT stored in a variable called `str`:
# that would shadow the builtin type, and any later str(...) call in the
# same kernel would fail with "'str' object is not callable"
sentence = 'this is string example....wow!!!'
print(sentence.translate(trantab))
In [20]:
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
# join glues the pieces together with a single space between them
sentence = ' '.join(parts)
print(sentence)
Example2: Don't use the `+` operator when unnecessary.
In [21]:
a, b = 'Is Chicago', 'Not Chicago?'
# explicit concatenation with the + operator ...
print(a + ' ' + b)
# ... versus letting print place the separator between its arguments
print(a, b, sep = ' ')
In [22]:
s = '{name} has {n} messages.'
# the named placeholders are filled from keyword arguments
values = {'name': 'Guido', 'n': 37}
s.format(**values)
Out[22]:
In [23]:
import os
import shutil
import textwrap

s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."
# re-flow the text so that no line is longer than 40 characters
print(textwrap.fill(s, 40))

# if you want to get the text to match the terminal size
# shutil.get_terminal_size() is used instead of os.get_terminal_size():
# the os version raises OSError when stdout is not attached to a terminal
# (as in a notebook kernel), while the shutil version falls back to a
# default size
print(shutil.get_terminal_size().columns)