Step by step through "Web Scraping with Python" by Richard Lawson — part 3/n
When trying the sample code in "link_crawler3.py", it always fails with the message below:
/usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py Downloading:http://example.webscraping.com Downloading--2 Downloading:http://example.webscraping.com Downloading --- 5 {'User-agent': {'User-agent': 'GoodCrawler'}} http://example.webscraping.com Traceback (most recent call last): File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 150, in <module> link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler') File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 36, in link_crawler html = download(url, headers, proxy=proxy, num_retries=num_retries) File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/common.py", line 75, in download5 htmlrsp = opener.open(requestnew) File "/usr/lib/python3.5/urllib/request.py", line 466, in open response = self._open(req, data) File "/usr/lib/python3.5/urllib/request.py", line 484, in _open '_open', req) File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain result = func(*args) File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open return self.do_open(http.client.HTTPConnection, req) File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open h.request(req.get_method(), req.selector, req.data, headers) File "/usr/lib/python3.5/http/client.py", line 1107, in request self._send_request(method, url, body, headers) File "/usr/lib/python3.5/http/client.py", line 1147, in _send_request self.putheader(hdr, value) File "/usr/lib/python3.5/http/client.py", line 1083, in putheader if _is_illegal_header_value(values[i]): TypeError: expected string or bytes-like object
I have searched the internet several times, and I think the code itself is right:
def download5(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download ``url`` and return its HTML as a string, or None on failure.

    Args:
        url: Absolute URL to fetch.
        user_agent: Either the User-Agent *string* itself (original
            behavior), or a complete headers dict such as
            ``{'User-agent': 'GoodCrawler'}``.  Accepting a dict guards
            against the caller mistake analyzed in this note: nesting a
            dict as a header *value* reaches ``http.client.putheader()``,
            whose value check only handles str/bytes/int and raises
            "TypeError: expected string or bytes-like object".
        proxy: Optional proxy URL, installed for this URL's scheme only.
        num_retries: How many times to retry on 5XX HTTP errors.

    Returns:
        The UTF-8-decoded HTML, or None when the download failed.
    """
    print('Downloading:%s' % url)
    print('Downloading --- 5')
    if isinstance(user_agent, dict):
        # Caller already built a full headers mapping; use it verbatim
        # instead of nesting it under 'User-agent'.
        headers = user_agent
    else:
        headers = {'User-agent': user_agent}
    print(headers)
    print(url)
    requestnew = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        # Route only this URL's scheme (http/https) through the proxy.
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        htmlrsp = opener.open(requestnew)
        html = htmlrsp.read().decode('utf-8')
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download5(url, user_agent, proxy, num_retries - 1)
    return html
Then we check "/usr/lib/python3.5/http/client.py":
# the patterns for both name and value are more lenient than RFC # definitions to allow for backwards compatibility _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search def putheader(self, header, *values): """Send a request header line to the server. For example: h.putheader('Accept', 'text/html') """ if self.__state != _CS_REQ_STARTED: raise CannotSendHeader() if hasattr(header, 'encode'): header = header.encode('ascii') if not _is_legal_header_name(header): raise ValueError('Invalid header name %r' % (header,)) values = list(values) for i, one_value in enumerate(values): if hasattr(one_value, 'encode'): values[i] = one_value.encode('latin-1') elif isinstance(one_value, int): values[i] = str(one_value).encode('ascii') if _is_illegal_header_value(values[i]): raise ValueError('Invalid header value %r' % (values[i],)) value = b'\r\n\t'.join(values) header = header + b': ' + value self._output(header)
#
>>> _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
>>> _is_illegal_header_value('identity')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: cannot use a bytes pattern on a string-like object
>>> vl = 'identity'
>>> type(vl)
<class 'str'>
>>> _is_illegal_header_value(vl)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: cannot use a bytes pattern on a string-like object
## Unnamed (anonymous) capturing groups: (a), (c)
>>> p1 = re.compile('(a)b(c)')  # two anonymous capturing groups, (a) and (c)
# re.match(pattern, string, flags=0)
# If zero or more characters at the beginning of *string* match the regular
# expression *pattern*, return a corresponding match object. Return None
# if the string does not match the pattern; note that this is different
# from a zero-length match.
# Note that even in MULTILINE mode, re.match() will only match at the
# beginning of the string and not at the beginning of each line.
>>> m = re.match('a', 'ASDasd')
>>> print(m)
None
>>> m = re.match('a', 'aSDasd')
>>> print(m)
<_sre.SRE_Match object; span=(0, 1), match='a'>
>>> m = re.match('a', 'aaaaaSDasd')
>>> print(m)
<_sre.SRE_Match object; span=(0, 1), match='a'>
# Pattern.search(string[, pos[, endpos]])
# Scan through *string* looking for the first location where this regular
# expression produces a match, and return a corresponding match object.
# Return None if no position in the string matches the pattern; note that
# this is different from finding a zero-length match at some point in the
# string.
>>> pattern = re.compile("d")
>>> pattern.search("dog")
<_sre.SRE_Match object; span=(0, 1), match='d'>
>>> pattern.search("ogdd")
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern.search("ogddddd")
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern.search("ogddddd", 1)
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern.search("ogddddd", 2)
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern = re.compile("ddd")
>>> pattern.search("dog")
>>> pattern.search("ogdd")
>>> pattern.search("ogddddd")
<_sre.SRE_Match object; span=(2, 5), match='ddd'>
>>> pattern.search("ogddddd", 2)
<_sre.SRE_Match object; span=(2, 5), match='ddd'>