Step by Step of "Web scraping with Python" ----Richard Lawson ---3/n

When trying the sample code of "link_crawler3.py", it always fails with the message below:

/usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py
Downloading:http://example.webscraping.com
Downloading--2
Downloading:http://example.webscraping.com
Downloading --- 5
{'User-agent': {'User-agent': 'GoodCrawler'}}
http://example.webscraping.com
Traceback (most recent call last):
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 150, in <module>
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 36, in link_crawler
    html = download(url, headers, proxy=proxy, num_retries=num_retries)
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/common.py", line 75, in download5
    htmlrsp = opener.open(requestnew)
  File "/usr/lib/python3.5/urllib/request.py", line 466, in open
    response = self._open(req, data)
  File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
    '_open', req)
  File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
    h.request(req.get_method(), req.selector, req.data, headers)
  File "/usr/lib/python3.5/http/client.py", line 1107, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1147, in _send_request
    self.putheader(hdr, value)
  File "/usr/lib/python3.5/http/client.py", line 1083, in putheader
    if _is_illegal_header_value(values[i]):
TypeError: expected string or bytes-like object

  and I have searched the internet several times, and I think the code is right

def download5(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download *url* and return its body as a str, with proxy support.

    Parameters:
        url         -- the URL to fetch.
        user_agent  -- either a User-agent string, or a complete headers
                       dict (some callers, e.g. link_crawler, pass their
                       whole headers dict here by position).
        proxy       -- optional proxy address; installed for the URL's scheme.
        num_retries -- how many times to retry on a 5XX HTTP error.

    Returns the decoded UTF-8 page body, or None on download error.
    """
    print('Downloading:%s'%url)
    print('Downloading --- 5')
    # Root cause of the traceback above: the caller passed a headers dict
    # as `user_agent`, so the User-agent header *value* became a dict and
    # http.client.putheader() raised
    # "TypeError: expected string or bytes-like object".
    # Accept both forms so header values are always strings.
    if isinstance(user_agent, dict):
        headers = user_agent
    else:
        headers = {'User-agent': user_agent}
    print(headers)
    print(url)
    requestnew = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        # Route only this URL's scheme (http/https) through the proxy.
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        htmlrsp = opener.open(requestnew)
        html = htmlrsp.read().decode('utf-8')
    except request.URLError as e:
        print('Download error:%s'%e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download5(url, user_agent, proxy, num_retries-1)
    return html

  then we check the "/usr/lib/python3.5/http/client.py"

 

# the patterns for both name and value are more lenient than RFC
# definitions to allow for backwards compatibility
# NOTE: both are *bytes* patterns (rb'...'), so calling either one on a
# plain str raises "TypeError: cannot use a bytes pattern on a
# string-like object" -- exactly what the REPL session below demonstrates.
_is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch
_is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search


    def putheader(self, header, *values):
        """Send a request header line to the server.

        For example: h.putheader('Accept', 'text/html')
        """
        # Headers may only be sent after the request line has been started.
        if self.__state != _CS_REQ_STARTED:
            raise CannotSendHeader()

        # str header names are encoded to ASCII bytes before validation.
        if hasattr(header, 'encode'):
            header = header.encode('ascii')

        if not _is_legal_header_name(header):
            raise ValueError('Invalid header name %r' % (header,))

        values = list(values)
        for i, one_value in enumerate(values):
            # str values are encoded to latin-1 bytes, ints to ASCII bytes.
            # Any other type (e.g. the dict passed in the traceback above)
            # is left untouched, so the bytes-pattern check that follows
            # raises "TypeError: expected string or bytes-like object".
            if hasattr(one_value, 'encode'):
                values[i] = one_value.encode('latin-1')
            elif isinstance(one_value, int):
                values[i] = str(one_value).encode('ascii')

            if _is_illegal_header_value(values[i]):
                raise ValueError('Invalid header value %r' % (values[i],))

        value = b'\r\n\t'.join(values)
        header = header + b': ' + value
        self._output(header)

  #

 

>>> _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
>>> _is_illegal_header_value('identity')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: cannot use a bytes pattern on a string-like object
>>> vl='identity'
>>> type(vl)
<class 'str'>
>>> _is_illegal_header_value(vl)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: cannot use a bytes pattern on a string-like object

  ## Anonymous (unnamed) groups: (a), (c)

>>> p1=re.compile('(a)b(c)')                         # anonymous groups (a), (c)

 

 

#  re.match(pattern, string, flags=0)

If zero or more characters at the beginning of string match the regular expression pattern, return a corresponding match object. Return None if the string does not match the pattern; note that this is different from a zero-length match.

Note that even in MULTILINE mode, re.match() will only match at the beginning of the string and not at the beginning of each line

 

>>> m = re.match('a','ASDasd')
>>> print(m)
None
>>> m = re.match('a','aSDasd')
>>> print(m)
<_sre.SRE_Match object; span=(0, 1), match='a'>
>>> m = re.match('a','aaaaaSDasd')
>>> print(m)
<_sre.SRE_Match object; span=(0, 1), match='a'>

  

#Pattern.search(string[, pos[, endpos]])

Scan through string looking for the first location where this regular expression produces a match, and return a corresponding match object. Return None if no position in the string matches the pattern;

note that this is different from finding a zero-length match at some point in the string.

>>> pattern = re.compile("d")
>>> pattern.search("dog")
<_sre.SRE_Match object; span=(0, 1), match='d'>
>>> pattern.search("ogdd")
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern.search("ogddddd")
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern.search("ogddddd",1)
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern.search("ogddddd",2)
<_sre.SRE_Match object; span=(2, 3), match='d'>
>>> pattern = re.compile("ddd")
>>> pattern.search("dog")
>>> pattern.search("ogdd")
>>> pattern.search("ogddddd")
<_sre.SRE_Match object; span=(2, 5), match='ddd'>
>>> pattern.search("ogddddd",2)
<_sre.SRE_Match object; span=(2, 5), match='ddd'>

  

 

posted @ 2020-03-30 15:21  碧水东流至此回  阅读(318)  评论(0编辑  收藏  举报