C2PythonCookBook
# set jupyter notebook theme
# jt -t monokai -f roboto -fs 12 -ofs 11 -N -T
# -T, --toolbar make toolbar visible
# -N, --nbname nb name/logo visible
2.1. Splitting Strings on Any of Multiple Delimiters
# re.split()
line = 'asdf fjdk; afed, fjek,asdf, foo'
# reformed using delimiters 'asdf fjdk;afed,fjek,asdf,foo'
import re
fields = re.split(r'[;,\s]\s*',line)
print(type(fields))
print(fields)
# capture group in parentheses: the matched text is also included in the result
print("use capture group:")
# the trailing '\s*' still consumes any extra whitespace, so only the delimiter itself is captured
print(re.split(r'(;|,|\s)\s*', line)) # note the '|' alternation, unlike the character class [;,\s]
# use the split characters later on to reform an output string
# fields = re.split(r'(;,\s)\s*',line) # WRONG: '(;,\s)' matches the literal sequence ';,<whitespace>', not any single delimiter
fields = re.split(r'(;|,|\s)\s*', line)
print(fields)
values = fields[0::2]
delimiters = fields[1::2]
# pad delimiters so it has the same number of elements as values
delimiters.append("")
print(values,delimiters)
print(len(values),len(delimiters))
output = ""
# CookBook way
print(''.join(v+d for v,d in zip(values, delimiters)))
# my way
for v,d in zip(values,delimiters):
    output += v + d
print(output)
# noncapture group
print("noncapture group:")
print(re.split(r'(?:;|,|\s)\s*',line)) # (?:regex)
<class 'list'>
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
use capture group:
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo'] [' ', ';', ',', ',', ',', '']
6 6
asdf fjdk;afed,fjek,asdf,foo
asdf fjdk;afed,fjek,asdf,foo
noncapture group:
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
2.2. Matching Text at the Start or End of a String
# str.endswith() str.startswith()
filename = '1.txt'
print(filename.endswith('.txt'))
print(filename.startswith('2'))
urlname = 'http://www.python.org'
print(urlname.startswith('http:'))
# another example, provide a tuple of possibilities
import os
filenames = os.listdir(r'D:\LinuxShare\lab1') # raw string so the backslashes aren't treated as escapes
print(filenames)
# endswith_c_or_o = [name for name in filenames if name.endswith('.c','.o')]
# TypeError: slice indices must be integers or None or have an __index__ method
# Reason: without the inner parentheses, '.c' and '.o' are passed as two separate
# arguments (suffix, start) instead of one tuple of suffixes
endswith_c_or_o = [name for name in filenames if name.endswith(('.c','.o'))]
print(endswith_c_or_o)
# must use a tuple, not a list or set
choices = ['http:','ftp:']
# print(urlname.startswith(choices))
# TypeError: startswith first arg must be str or a tuple of str, not list
print(urlname.startswith(tuple(choices)))
# use slice to do basic prefix and suffix checking
print("use slice to check suffix:")
print(filename[-4:] == '.txt')
print(urlname[:5] == 'http:' or urlname[:6] == 'https:' or urlname[:4] == 'ftp:')
# use regular expression to check
import re
# overkill for simple matching
print(re.match('http:|https:|ftp:',urlname))
True
False
True
['g.c', 'g1.c', 'g2.c', 'hello.c', 'hello.i', 'hello.o', 'hello.out', 'hello.s', 'show_byte.c', 'sizeof.c']
['g.c', 'g1.c', 'g2.c', 'hello.c', 'hello.o', 'show_byte.c', 'sizeof.c']
True
use slice to check suffix:
True
True
<re.Match object; span=(0, 5), match='http:'>
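## (my aside, not from the book) os.path.splitext() is another simple way to
## check a suffix without slicing by hand; 'spam.txt' here is a made-up name
import os
base, ext = os.path.splitext('spam.txt')
print(ext == '.txt') # True
print(ext in ('.c', '.o')) # False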
2.3. Matching Strings Using Shell Wildcard Patterns
# from fnmatch import fnmatch, fnmatchcase
from fnmatch import fnmatch, fnmatchcase
print(fnmatch('foo.txt','*.txt'))
print(fnmatch('foo.txt','?oo.txt'))
print(fnmatch('Data45.csv','Data[0-9][0-9]*'))
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
csvfile = [name for name in names if fnmatch(name,'*.csv')]
print(csvfile)
## fnmatch() matches patterns using the same case-sensitivity rules as the
## system's underlying filesystem
# on Windows
print("for matching 'foo.txt','*.TXT':")
print("fnmatch on Windows:",fnmatch('foo.txt','*.TXT'))
# exactly case sensitive, fnmatchcase
print("fnmatchcase everywhere:",fnmatchcase('foo.txt','*.TXT'))
## data processing of nonfilename strings
addresses = [
'5412 N CLARK ST',
'1060 W ADDISON ST',
'1039 W GRANVILLE AVE',
'2122 N CLARK ST',
'4802 N BROADWAY',
]
allST = [addr for addr in addresses if fnmatchcase(addr,'*ST')]
print("allST:",allST)
_54CLARK = [addr for addr in addresses if fnmatchcase(addr,'54*CLARK*')]
print("_54CLARK:",_54CLARK)
## fnmatch sits between the functionality of simple string methods and
## the full power of regular expressions
True
True
True
['Dat1.csv', 'Dat2.csv']
for matching 'foo.txt','*.TXT':
fnmatch on Windows: True
fnmatchcase everywhere: False
allST: ['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
_54CLARK: ['5412 N CLARK ST']
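## (my aside, not from the book) fnmatch.translate() exposes the regular
## expression a shell wildcard pattern is converted to internally
## (the exact form varies across Python versions)
import fnmatch
print(fnmatch.translate('*.csv'))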
2.4. Matching and Searching for Text Patterns
# str.find(): find location; ==: exact match; startswith()/endswith()
text = 'yeah, but no, but yeah, but no, but yeah'
str1 = '0123456789012345'
# Search for the location of the first occurrence
print(text.find('no'))
# re.match
import re
text1 = '07/04/2022'
text2 = 'July 04, 2022'
print(re.match(r'\d+/\d+/\d+',text1)) # the r prefix keeps backslashes literal; recommended for regexes
print(re.match(r'\d+/\d+/\d+',text2))
# Do a lot of matches using the same pattern, precompile the regular expression pattern
print("compile regular expression pattern to match:")
datepattern = re.compile(r'\d+/\d+/\d+')
def match_date(text):
    if datepattern.match(text):
        print("yes")
    else:
        print("no")
match_date(text1)
match_date(text2)
text3 = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print("text3:",text3)
print("all dates in text3:",datepattern.findall(text3))
# introduce capture groups by enclosing parts of the pattern in parentheses
datepattern_group = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepattern_group.match(text1)
print("capture groups:",m)
# Extract the contents of each group
print("the contents of m:")
print("m.group():",m.group())
print("m.groups():",m.groups())
print("len(m.groups()):",len(m.groups()))
for i in range(0,len(m.groups()) + 1):
    print("m.group({}):{}".format(i,m.group(i)))
# Find all matches (notice splitting into tuples)
havefindall = datepattern_group.findall(text3)
print("datepattern_group findall:",havefindall)
for month,day,year in havefindall:
    print("{}-{}-{}".format(year,month,day))
# find matches iteratively, use the finditer() method
print("finditer():")
for m in datepattern_group.finditer(text3):
    print("m.groups():",m.groups())
## the match() method only checks the beginning of a string.
## It’s possible that it will match things you aren’t expecting. For example:
text4 = '11/27/2012abcdef'
print("match something strange:")
print("text4:",text4)
print("not using an end-marker($):")
m = datepattern_group.match(text4)
print("m:",m)
print("m.groups():",m.groups())
## If you want an exact match, make sure the pattern includes the end-marker ($),
date_end_marker = re.compile(r'(\d+)/(\d+)/(\d+)$')
print("using an end-marker($):")
m = date_end_marker.match(text4)
print("m:",m)
# print("m.group():",m.groups())
# AttributeError: 'NoneType' object has no attribute 'groups'
10
<re.Match object; span=(0, 10), match='07/04/2022'>
None
compile regular expression pattern to match:
yes
no
text3: Today is 11/27/2012. PyCon starts 3/13/2013.
all dates in text3: ['11/27/2012', '3/13/2013']
capture groups: <re.Match object; span=(0, 10), match='07/04/2022'>
the contents of m:
m.group(): 07/04/2022
m.groups(): ('07', '04', '2022')
len(m.groups()): 3
m.group(0):07/04/2022
m.group(1):07
m.group(2):04
m.group(3):2022
datepattern_group findall: [('11', '27', '2012'), ('3', '13', '2013')]
2012-11-27
2013-3-13
finditer():
m.groups(): ('11', '27', '2012')
m.groups(): ('3', '13', '2013')
match something strange:
text4: 11/27/2012abcdef
not using an end-marker($):
m: <re.Match object; span=(0, 10), match='11/27/2012'>
m.groups(): ('11', '27', '2012')
using an end-marker($):
m: None
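## (my aside, not from the book) re.fullmatch() is an alternative to the $
## end-marker: it succeeds only if the entire string matches
print("re.fullmatch as an alternative to $:")
print(re.fullmatch(r'(\d+)/(\d+)/(\d+)', text4)) # None
print(re.fullmatch(r'(\d+)/(\d+)/(\d+)', '11/27/2012'))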
2.5. Searching and Replacing Text
# .replace()
print("str.replace():")
text1 = 'yeah, but no, but yeah, but no, but yeah'
print(text1)
print(text1.replace('yeah','yep'))
# re.sub('regex1','regex2',text)
import re
print("re.sub():")
text2 = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(text2)
print(re.sub(r'(\d+)/(\d+)/(\d+)',r'\3-\1-\2',text2))
# pattern.sub('regex',text)
print("pattern.sub('regex',text):")
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(r'\3-\1-\2',text2))
## specify a substitution callback function instead
# pattern.sub(function,text)
from calendar import month_abbr
def change_date(m):
    mon = month_abbr[int(m.group(1))]
    return "{} {} {}".format(m.group(2),mon,m.group(3)) # day, mon, year
print("pattern.sub(function,text):")
print(datepat.sub(change_date,text2))
# pattern.subn('regex',text) can show how many substitutions were made
print("pattern.subn('regex',text):")
newtext2, n = datepat.subn(r'\3-\1-\2',text2)
print("there are {} substitutions.".format(n))
print("after substitution, text2 is:\n{}".format(newtext2))
str.replace():
yeah, but no, but yeah, but no, but yeah
yep, but no, but yep, but no, but yep
re.sub():
Today is 11/27/2012. PyCon starts 3/13/2013.
Today is 2012-11-27. PyCon starts 2013-3-13.
pattern.sub('regex',text):
Today is 2012-11-27. PyCon starts 2013-3-13.
pattern.sub(function,text):
Today is 27 Nov 2012. PyCon starts 13 Mar 2013.
pattern.subn('regex',text):
there are 2 substitutions.
after substitution, text2 is:
Today is 2012-11-27. PyCon starts 2013-3-13.
?2.6. Searching and Replacing Case-Insensitive Text
?question about the matchcase(word) function in the second code block (see my note after its output below)
##### FROM NOW ON, SPLIT PER TEST CODE BLOCK
# flags=re.IGNORECASE
text = 'UPPER PYTHON, lower python, MixEd PythOn'
print("all python:")
print(re.findall('python',text,flags=re.IGNORECASE))
print("substitute 'python' with 'monkey':")
print(re.sub('python','monkey',text,flags=re.IGNORECASE))
all python:
['PYTHON', 'python', 'PythOn']
substitute 'python' with 'monkey':
UPPER monkey, lower monkey, MixEd monkey
# use a support function so that the replacement text matches the case of the matched text
print("use a support function so that the replacement text "
      "matches the case of the matched text:")
### matchcase(word) is a closure factory: it returns the inner replace()
### function, which re.sub() then calls once for every match object m
def matchcase(word):
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return replace
print(text)
print(re.sub('python',matchcase('monkey'),text,flags=re.IGNORECASE))
use a support function so that the replacement text matches the case of the matched text:
UPPER PYTHON, lower python, MixEd PythOn
UPPER MONKEY, lower monkey, MixEd Monkey
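## (my note, answering my own question above) calling the pieces by hand makes
## the closure's behavior visible: matchcase('monkey') returns replace(), and
## replace() picks the casing of 'monkey' from each match it is handed
rep = matchcase('monkey')
m = re.search('PYTHON', 'UPPER PYTHON')
print(rep(m)) # MONKEY, because the matched text 'PYTHON' is all uppercase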
2.7. Specifying a Regular Expression for the Shortest Match
# illustrate the problem
str_pat = re.compile(r'\"(.*)\"')
text1 = 'Computer says "no."'
text2 = 'Computer says "no." Phone says "yes."'
print("try to find all quoted string:")
print(text1)
print(str_pat.findall(text1))
# this goes wrong when matching text2
print("However, this goes wrong when matching text2:")
print(text2)
print(str_pat.findall(text2))
# add ? modifier after the * operator to make the matching nongreedy
str_pat_nongreedy = re.compile(r'\"(.*?)\"')
print("add ? modifier after the * operator to fix:")
print(text2)
print(str_pat_nongreedy.findall(text2))
try to find all quoted string:
Computer says "no."
['no.']
However, this goes wrong when matching text2:
Computer says "no." Phone says "yes."
['no." Phone says "yes.']
add ? modifier after the * operator to fix:
Computer says "no." Phone says "yes."
['no.', 'yes.']
?2.8. Writing a Regular Expression for Multiline Patterns
should know more about regular expressions: '(?:...)' non-capturing groups, capture groups, '*?', greedy vs. non-greedy matching (small refresher sketch below)
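## (my refresher sketch, not from the book) '?:', capture groups, and '*?'
import re
s_demo = '<a> <b>'
print(re.findall(r'<(.*)>', s_demo)) # greedy '*': ['a> <b']
print(re.findall(r'<(.*?)>', s_demo)) # non-greedy '*?': ['a', 'b']
print(re.findall(r'<(?:.*?)>', s_demo)) # non-capturing '(?:...)': ['<a>', '<b>']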
# dot(.) doesn't match '\n'
text1 = '/* this is a comment */'
text2 = '''/* this is a
multiline comment */'''
comment = re.compile(r'/\*(.*?)\*/')
print("text1:",text1)
print(comment.findall(text1))
print("dot(.) doesn't match '\\n':")
print("text2:")
print(text2)
print(comment.findall(text2))
# to fix this, modify the regex
# my modified version; findall() returns a tuple per match when a pattern has
# several groups, and the inner group (.|\n) keeps only its last repetition
# (here a ' '), which is where the stray ' ' comes from
comment_v2 = re.compile(r'/\*((.|\n)*?)\*/')
print("to fix this, modify the regex")
print(comment_v2.findall(text2))
comment_v3 = re.compile(r'/\*(?:(.|\n)*?)\*/')
comment_v4 = re.compile(r'/\*((?:.|\n)*?)\*/')
print(comment_v3.findall(text2))
print(comment_v4.findall(text2))
# another way is to use re.DOTALL as flags when compile a pattern
print("another way is to use re.DOTALL as a flag:")
comment_flag = re.compile(r'/\*(.*?)\*/',flags=re.DOTALL)
print(comment_flag.findall(text2))
text1: /* this is a comment */
[' this is a comment ']
dot(.) doesn't match '\n':
text2:
/* this is a
multiline comment */
[]
to fix this, modify the regex
[(' this is a\n multiline comment ', ' ')]
[' ']
[' this is a\n multiline comment ']
another way is to use re.DOTALL as a flag:
[' this is a\n multiline comment ']
2.9. Normalizing Unicode Text to a Standard Representation
# some characters have more than one valid sequence of code points
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
print(s1)
print(s2)
print("s1 == s2?",s1 == s2)
print("len(s1):{}, len(s2):{}".format(len(s1),len(s2)))
Spicy Jalapeño
Spicy Jalapeño
s1 == s2? False
len(s1):14, len(s2):15
# using unicodedata module to fix this problem
print("using unicodedata module to fix this problem")
import unicodedata
## C for composed ; D for decomposed
t1 = unicodedata.normalize('NFC',s1)
t2 = unicodedata.normalize('NFC',s2)
print("t1:",t1)
print("t2:",t2)
print("after normalization(NFC), t1 == t2?",t1 == t2)
print("ascii(t1):",ascii(t1))
print("ascii(t2):",ascii(t2))
t3 = unicodedata.normalize('NFD',s1)
t4 = unicodedata.normalize('NFD',s2)
print("t3:",t3)
print("t4:",t4)
print("after normalization(NFD), t3 == t4?",t3 == t4)
print("ascii(t3):",ascii(t3))
print("ascii(t4):",ascii(t4))
using unicodedata module to fix this problem
t1: Spicy Jalapeño
t2: Spicy Jalapeño
after normalization(NFC), t1 == t2? True
ascii(t1): 'Spicy Jalape\xf1o'
ascii(t2): 'Spicy Jalape\xf1o'
t3: Spicy Jalapeño
t4: Spicy Jalapeño
after normalization(NFD), t3 == t4? True
ascii(t3): 'Spicy Jalapen\u0303o'
ascii(t4): 'Spicy Jalapen\u0303o'
# see how the combined letters are broken apart here
print("see how the combined letters are broken apart here")
s = '\ufb01'
print("s:",s)
print("NFD of s:",unicodedata.normalize('NFD', s))
print("NFKD of s:",unicodedata.normalize('NFKD', s))
print("NFKC of s:",unicodedata.normalize('NFKC', s))
see how the combined letters are broken apart here
s: ﬁ
NFD of s: ﬁ
NFKD of s: fi
NFKC of s: fi
# remove all diacritical marks from some text
t1 = unicodedata.normalize('NFD', s1)
print("decomposed:")
print(''.join(c for c in t1 if not unicodedata.combining(c)))
print("composed:")
t1 = unicodedata.normalize('NFC', s1)
print(''.join(c for c in t1 if not unicodedata.combining(c)))
decomposed:
Spicy Jalapeno
composed:
Spicy Jalapeño
2.10. Working with Unicode Characters in Regular Expressions
# \d already matches any unicode digit character
import re
num = re.compile(r'\d+')
ascii_digits = '123'
print("ascii digits:",ascii_digits)
print("match ascii digits:",num.match(ascii_digits))
arabic_digits = '\u0661\u0662\u0663'
print("arabic_digits:",arabic_digits)
print("match arabic digits:",num.match(arabic_digits))
# here is a regex that matches Arabic code pages
arabic_regex = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
ascii digits: 123
match ascii digits: <re.Match object; span=(0, 3), match='123'>
arabic_digits: ١٢٣
match arabic digits: <re.Match object; span=(0, 3), match='١٢٣'>
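## (my usage sketch, not from the book) exercising the compiled pattern above
print(arabic_regex.findall('abc \u0661\u0662\u0663 def')) # ['١٢٣']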
# case-insensitive matching combined with case folding: re.IGNORECASE does not
# do full Unicode case folding ('ß' vs 'SS')
strange = 'stra\u00dfe'
pat = re.compile('stra\u00dfe',re.IGNORECASE)
print("a strange str:", strange)
print("match this str:",pat.match(strange))
print("change to upper, doesn't match:")
print("strange.upper():",strange.upper())
# upper() case-folds 'ß' to 'SS'
print("doesn't match this str:",pat.match(strange.upper()))
a strange str: straße
match this str: <re.Match object; span=(0, 6), match='straße'>
change to upper, doesn't match:
strange.upper(): STRASSE
doesn't match this str: None
2.11. Stripping Unwanted Characters from Strings
# Whitespace stripping
s = ' hello world \n'
s
' hello world \n'
s.strip()
'hello world'
s.lstrip()
'hello world \n'
s.rstrip()
' hello world'
# Character stripping
t = '-----hello====='
t
'-----hello====='
t.strip('-')
'hello====='
t.lstrip('-')
'hello====='
t.rstrip('-')
'-----hello====='
t.strip('-=')
'hello'
## stripping does not apply to any text in the middle of a string.
s = ' hello world \n'
s = s.strip()
s
'hello world'
## if you want to do something to the inner space, use replace() or a regex
s.replace(' ','')
'helloworld'
import re
re.sub(r'\s+',' ',s)
'hello world'
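## (a sketch of the book's remark, assuming a file 'somefile.txt' exists)
## stripping combines nicely with a generator expression when reading files
with open('somefile.txt') as f:
    lines = (line.strip() for line in f) # lazy, no intermediate list
    for line in lines:
        print(line)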
?2.12. Sanitizing and Cleaning Up Text
1. questions about \t and \r when used together (worked through below)
2. ord(), unicodedata.digit(), unicodedata.category()
3. chr()
# https://www.pythonpool.com/carriage-return-python/
# notice that there is a whitespace after 'Latracal'
string = 'My web\nsite is Latracal \rSolution'
print(string)
My web
Solution
## \t is a single tab character, not 8 spaces; most renderers jump to the next
## tab stop (often every 8 columns). \r only moves the cursor back to the start
## of the line, so how much text appears overwritten depends on the renderer,
## which is why the outputs below differ between jupyter and CMD
str = ('\tLatracal \rsolution') # NOTE: this rebinds the built-in str(), which breaks str(d) in 2.14
print(str)
str1 = '\t'
str2 = ' '
str3 = ' '
print('|'+str1+'|'+str2+'|'+str3+'|')
print(str1 == str2)
print(str1 == str3)
print("\t1234567890\rsolution")
# in this renderer, 'solution' (8 chars) visually overwrites the tab plus the first 7 characters after it
solution
| | | |
False
False
solution
str = ('\tLatracal \rsolution')
print(str)
solution
There are some differences between jupyter notebook and CMD when running:
str = ('\tLatracal \rsolution')
print(str)
My jupyter notebook's output:(see my attached image)
solutionl
But running the same code in CMD, I got the same output as your blog (https://www.pythonpool.com/carriage-return-python/), which differs from jupyter notebook:
C:\Users\Matrix>python
Python 3.9.12 (tags/v3.9.12:b28265d, Mar 23 2022, 23:52:46) [MSC v.1929 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> str = ('\tLatracal \rsolution')
>>> print(str)
solutionLatracal
>>> str
'\tLatracal \rsolution'
This is the version information about my jupyter notebook:
C:\Users\Matrix>jupyter --version
jupyter core : 4.7.1
jupyter-notebook : 6.4.3
qtconsole : 5.1.1
ipython : 7.26.0
ipykernel : 6.1.0
jupyter client : 6.2.0
jupyter lab : not installed
nbconvert : 6.1.0
ipywidgets : 7.6.3
nbformat : 5.1.3
traitlets : 5.0.5
I don't know why this happens. I would be very grateful if anyone could help.
# you've got a strange string
s = 'pýtĥöñ\fis\tawesome\r\n'
s
'pýtĥöñ\x0cis\tawesome\r\n'
# make a small translation table and use translate():
print("\\f:",'\f')
remap = {
ord('\f'): ' ',
ord('\t'): ' ',
ord('\r'): '' # Delete
}
a = s.translate(remap)
a
\f:
'pýtĥöñ is awesome\n'
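## (my aside, not from the book) str.maketrans() builds the same kind of
## translation table without spelling out ord() by hand
remap2 = str.maketrans({'\f': ' ', '\t': ' ', '\r': None}) # None deletes
print(s.translate(remap2))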
# remove all combining characters:
import unicodedata
import sys
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) \
if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)
print("b:",b)
b.translate(cmb_chrs)
b: pýtĥöñ is awesome
'python is awesome\n'
# maps all Unicode decimal digit characters to their equivalent in ASCII:
digitmap = {c: ord('0') + unicodedata.digit(chr(c))\
for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd' }
print("len(digitmap):",len(digitmap))
# Arabic digits
x = '\u0661\u0662\u0663'
print("Arabic digits:",x)
print("translate:",x.translate(digitmap))
len(digitmap): 650
Arabic digits: ١٢٣
translate: 123
print('a:',a)
b = unicodedata.normalize('NFD', a)
print('b:',b)
b.encode(encoding='ascii',errors='ignore').decode('ascii')
a: pýtĥöñ is awesome
b: pýtĥöñ is awesome
'python is awesome\n'
2.13. Aligning Text Strings
# ljust() rjust() center() of strings
text = 'Hello World'
print(text.ljust(20,'*'))
print(text.rjust(20))
print(text.center(20,'-'))
Hello World*********
         Hello World
----Hello World-----
# format() method of any value
print(format(text,'=>20'))
print(format(text,'<20s'))
print(format(text,'-^20s'))
# format multiple values
print('{:=>10} {:*<10s}'.format('Hello','World'))
# format any value, not specific to strings
x = 123.4567
print(format(x, '*>10'))
print(format(x, '^10.2f'))
=========Hello World
Hello World
----Hello World-----
=====Hello World*****
**123.4567
  123.46  
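## (my aside, not from the book) fill, alignment, and width can themselves be
## parameterized inside a format spec
print('{:{fill}{align}{width}}'.format(text, fill='*', align='^', width=20))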
?2.14. Combining and Concatenating Strings
1. question about ','.join(str(d) for d in data) differing between jupyter and CMD (answered below: str was rebound in 2.12)
2. yield with combined I/O operations (sketch at the end of this section)
# join()
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
print('*'.join(parts))
print(''.join(parts))
Is*Chicago*Not*Chicago?
IsChicagoNotChicago?
# + operation
## many + operation is quite a bit slower than join() method
a = 'Is Chicago'
b = 'Not Chicago?'
print('{} {}'.format(a,b))
print(a + ' ' + b)
## to combine string literals in source code, simply place them adjacent to
## each other with no + operator
c = 'Wabby' "Wabbo"
print(c)
d = 'Crazy'
"Da\
ve"
print(d)
e = 'Crazy' \
"Da\
ve"
print(e)
Is Chicago Not Chicago?
Is Chicago Not Chicago?
WabbyWabbo
Crazy
CrazyDave
## not actually a jupyter bug: section 2.12 above rebound the name str to a
## string object, so str(d) here tries to call a string. A fresh CMD session
## still has the built-in str, so the same code works there
## jupyter notebook
# strx = ','.join(str(d) for d in data)
# print(strx)
# TypeError: 'str' object is not callable
## CMD
# >>> data = ['ACME', 50, 91.1]
# >>> ','.join(str(d) for d in data)
# 'ACME,50,91.1'
data = ['ACME', '50', '91.1']
a, b, c = data
print(a + ':' + b + ':' + c) # Ugly
print(':'.join([a, b, c])) # Still ugly
print(a, b, c, sep=':') # Better
ACME:50:91.1
ACME:50:91.1
ACME:50:91.1
# Still to learn: yield with combined I/O operations (a sketch follows)
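## (my sketch of the book's idea, reconstructed from memory, details may
## differ) a generator produces the fragments, and the I/O layer decides how
## to combine them
def sample():
    yield 'Is'
    yield 'Chicago'
    yield 'Not'
    yield 'Chicago?'

print(' '.join(sample())) # combine in memory
# or stream the parts to a file-like object without building one big string
import io
f = io.StringIO()
for part in sample():
    f.write(part)
print(f.getvalue())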
?2.15. Interpolating Variables in Strings
1. question: what does sys._getframe(1) do? (it grabs the caller's stack frame, see below)
# format()
s = '{name} has {n} messages.'
print(s.format(name='Guido', n=37))
# s.format(name='Guido')
# KeyError: 'n'
Guido has 37 messages.
## One way to avoid this is to define an alternative dictionary class with a
## __missing__() method
class safesub(dict):
    def __missing__(self, key):
        return '{' + key + '}'
# Now use this class to wrap the inputs to format_map()
name = 'Guido'
n = 37
del n #make sure n is undefined
s.format_map(safesub(vars()))
'Guido has {n} messages.'
# combination of format_map() and vars()
name = 'Guido'
n = 37
print(s.format_map(vars()))
# vars() also works with instances
class Info:
    def __init__(self, name, n):
        self.name = name
        self.n = n
a = Info('Guido', 37)
# s.format(vars(a))
# KeyError: 'name'
print(s.format_map(vars(a)))
Guido has 37 messages.
Guido has 37 messages.
## hide the variable substitution process behind a small utility function
## that employs a so-called “frame hack.”
import sys
def sub(text):
    # sys._getframe(1) is the caller's stack frame; f_locals are its local variables
    return text.format_map(safesub(sys._getframe(1).f_locals))
name = 'Guido'
n = 37
print(sub('Hello {name}'))
print(sub("You have {n} messages."))
print(sub('Your favorite color is {color}'))
Hello Guido
You have 37 messages.
Your favorite color is {color}
## an alternative to the solution
name = 'Guido'
n = 37
# '%(name) has %(n) messages.' % vars()
# ValueError: unsupported format character 'm' (0x6d) at index 17
# Reason: each %(key) needs a conversion type, e.g. '%(name)s has %(n)s messages.'
## template string
import string
s = string.Template('$name has $n messages.')
s.substitute(vars())
'Guido has 37 messages.'
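## (my aside, not from the book) Template.safe_substitute() tolerates missing
## names instead of raising KeyError the way substitute() does
del n # make n undefined again
print(s.safe_substitute(vars())) # 'Guido has $n messages.'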
2.16. Reformatting Text to a Fixed Number of Columns
# textwrap
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."
print(s)
import textwrap
print(textwrap.fill(s, 70))
print(textwrap.fill(s, 40))
print(textwrap.fill(s, 70, initial_indent='    '))
print(textwrap.fill(s, 70, subsequent_indent='    '))
Look into my eyes, look into my eyes, the eyes, the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under.
Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
    Look into my eyes, look into my eyes, the eyes, the eyes, the
eyes, not around the eyes, don't look around the eyes, look into my
eyes, you're under.
Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
    not around the eyes, don't look around the eyes, look into my
    eyes, you're under.
import os
os.get_terminal_size().columns
157
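## (my aside, not from the book) textwrap.wrap() returns the wrapped lines as
## a list instead of a single string, handy for further processing
print(textwrap.wrap(s, 40))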
2.17. Handling HTML and XML Entities in Text
# html.escape()
s = 'Elements are written as "<tag>text</tag>".'
import html
print(s)
print(html.escape(s))
# Disable escaping of quotes
print(html.escape(s, quote=False))
Elements are written as "<tag>text</tag>".
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')
b'Spicy Jalape&#241;o'
s = 'Spicy &quot;Jalape&#241;o&quot;.'
## CookBook's code doesn't work on Python 3.9+:
# from html.parser import HTMLParser
# p = HTMLParser()
# p.unescape(s)
# AttributeError: 'HTMLParser' object has no attribute 'unescape'
# HTMLParser.unescape() was removed in 3.9; html.unescape() is the replacement
print(html.unescape(s))
# 'Spicy "Jalapeño".'
t = 'The prompt is &gt;&gt;&gt;'
from xml.sax.saxutils import unescape
unescape(t)
'The prompt is >>>'
2.18. Tokenizing Text
text = 'foo = 23 + 42 * 10'
import re
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
WS = r'(?P<WS>\s+)'
EQ = r'(?P<EQ>\=)' # don't forget the backslash here
PLUS = r'(?P<PLUS>\+)' # don't forget the backslash here
TIMES = r'(?P<TIMES>\*)' # don't forget the backslash here
all_pat = re.compile('|'.join([NAME,NUM,WS,EQ,PLUS,TIMES]))
print(all_pat)
re.compile('(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\\d+)|(?P<WS>\\s+)|(?P<EQ>\\=)|(?P<PLUS>\\+)|(?P<TIMES>\\*)')
an interactive example of how a scanner object works (quoted from the CookBook):
>>> scanner = master_pat.scanner('foo = 42')
>>> scanner.match()
<_sre.SRE_Match object at 0x100677738>
>>> _.lastgroup, _.group()
('NAME', 'foo')
>>> scanner.match()
<_sre.SRE_Match object at 0x100677738>
>>> _.lastgroup, _.group()
('WS', ' ')
>>> scanner.match()
<_sre.SRE_Match object at 0x100677738>
>>> _.lastgroup, _.group()
('EQ', '=')
>>> scanner.match()
<_sre.SRE_Match object at 0x100677738>
>>> _.lastgroup, _.group()
('WS', ' ')
>>> scanner.match()
<_sre.SRE_Match object at 0x100677738>
>>> _.lastgroup, _.group()
('NUM', '42')
>>> scanner.match()
>>>
from collections import namedtuple
Token = namedtuple('Token',['type','value'])
def generate_tokens(pat, text):
    scanner = pat.scanner(text)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
# Example use
for tok in generate_tokens(all_pat, 'foo = 42'):
    print(tok)
# filter out all whitespace tokens
tokens = (tok for tok in generate_tokens(all_pat, 'foo = 42') if tok.type != 'WS')
for token in tokens:
    print(token)
Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')
Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='42')
# order matters when one pattern is a substring of another: longer first
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'
master_pat = re.compile('|'.join([LE,LT,EQ])) # Correct
# master_pat = re.compile('|'.join([LT,LE,EQ])) # Incorrect
## watch out for patterns that form substrings
from collections import namedtuple
import re
Token = namedtuple('Token',['type','value'])
def generate_tokens(pat, text):
    scanner = pat.scanner(text)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
PRINT = r'(?P<PRINT>print)' # the '?' was missing here, see note below
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
master_pat = re.compile('|'.join([PRINT, NAME]))
for tok in generate_tokens(master_pat, 'printer'):
    print(tok)
## jupyter gave no output before because I wrote (P<PRINT>print) without the
## '?': that group matches the literal text 'P<PRINT>print', which never
## occurs in 'printer'. With (?P<...>) fixed, the expected output appears:
# Token(type='PRINT', value='print')
# Token(type='NAME', value='er')
??2.19. Writing a Simple Recursive Descent Parser
# Too difficult for me for now; I'm skipping this section
2.20. Performing Text Operations on Byte Strings
## Byte strings already support most of the same built-in operations as text strings.
bdata = b'Hello World'
print(bdata[0:5])
print(bdata.startswith(b'Hello'))
print(bdata.split())
print(bdata.replace(b'Hello', b'Hello Cruel'))
b'Hello'
True
[b'Hello', b'World']
b'Hello Cruel World'
## Such operations also work with byte arrays.
bdata = bytearray(b'Hello World')
print(bdata[0:5])
print(bdata.startswith(b'Hello'))
print(bdata.split())
print(bdata.replace(b'Hello', b'Hello Cruel'))
bytearray(b'Hello')
True
[bytearray(b'Hello'), bytearray(b'World')]
bytearray(b'Hello Cruel World')
## apply regular expression pattern matching to byte strings, the patterns
## themselves need to be specified as bytes
data = b'FOO:BAR,SPAM'
import re
# re.split('[:,]',data)
# TypeError: cannot use a string pattern on a bytes-like object
re.split(b'[:,]',data)
[b'FOO', b'BAR', b'SPAM']
## a few notable differences
a = 'Hello World'
print(a[0])
b = b'Hello World'
print(b[0])
# bytestring.decode()
print(b)
print(b.decode('ascii'))
H
72
b'Hello World'
Hello World
There are no string formatting operations available for byte strings:
>>> b'%10s %10d %10.2f' % (b'ACME', 100, 490.1)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: unsupported operand type(s) for %: 'bytes' and 'tuple'
>>> b'{} {} {}'.format(b'ACME', 100, 490.1)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'bytes' object has no attribute 'format'
>>>
## using normal text strings and encoding to format bytestring
'{:10s} {:10d} {:10.2f}'.format('ACME', 100, 490.1).encode('ascii')
b'ACME              100     490.10'
if you supply a filename encoded as bytes instead of a text string,
it usually disables filename encoding/decoding
>>> # Write a UTF-8 filename
>>> with open('jalape\xf1o.txt', 'w') as f:
... f.write('spicy')
...
>>> # Get a directory listing
>>> import os
>>> os.listdir('.') # Text string (names are decoded)
['jalapeño.txt']
>>> os.listdir(b'.') # Byte string (names left as bytes)
[b'jalapen\xcc\x83o.txt']
>>>