Step by Step through "Web Scraping with Python" by Richard Lawson --- 2/n

1. Debian 9 + Python 3.5.3

"python3 --version"

link_crawler3.py
# -*- coding: utf-8 -*-

import re
import time
from collections import deque
from datetime import datetime
from urllib import robotparser
from urllib.parse import urldefrag, urljoin, urlparse

from common import download  # download() is defined in common.py (see step 4)


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1):
    """
    Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = deque([seed_url])
    # the URLs that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent


    while crawl_queue:
        url = crawl_queue.pop()
        # check that this URL passes the robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            # NOTE: common.download5 expects a user_agent string, not a headers
            # dict -- passing headers here is what triggers the TypeError in step 3
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex and html:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether this link has already been crawled
                    if link not in seen:
                        seen[link] = depth + 1
                        # check that the link is within the same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.append(link)

            # check whether the download maximum has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt: %s' % url)



class Throttle:
    """
    Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}


    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)


        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


# NOTE: download() is imported from common.py (shown in step 4), so it is
# not redefined here.


def normalize(seed_url, link):
    """
    Normalize this URL by removing the hash fragment and joining it to the domain
    """
    link, _ = urldefrag(link)  # remove hash to avoid duplicates
    return urljoin(seed_url, link)


def same_domain(url1, url2):
    """
    Return True if both URLs belong to the same domain
    """
    return urlparse(url1).netloc == urlparse(url2).netloc


def get_robots(url):
    """
    Initialize the robots.txt parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """
    Return a list of links found in html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')

  

  2.  It seems we need to learn more about "urllib". The first run died inside get_robots() with an AttributeError (this traceback comes from an earlier version of link_crawler3.py that still called urlparse.urljoin):

cor@debian:~$ /usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py
Traceback (most recent call last):
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 147, in <module>
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 22, in link_crawler
    rp = Throttle.get_robots(seed_url)
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 133, in get_robots
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
AttributeError: 'function' object has no attribute 'urljoin'
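
The cause of the AttributeError: after from urllib.parse import urlparse, the name urlparse is a function, not the Python 2 urlparse module, so urlparse.urljoin is an attribute lookup on a function object. In Python 3 these helpers all live in urllib.parse. A minimal sketch of the three calls the crawler needs (the URL is only an illustration):

from urllib.parse import urljoin, urlparse, urldefrag

base = 'http://example.webscraping.com/index/1'   # illustrative URL
print(urljoin(base, '/robots.txt'))               # http://example.webscraping.com/robots.txt
print(urlparse(base).netloc)                      # example.webscraping.com
clean, fragment = urldefrag(base + '#row2')       # strip the '#row2' fragment
print(clean)                                      # http://example.webscraping.com/index/1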

  

3.  The next run gets further, but ends with a new error:

TypeError: expected string or bytes-like object
cor@debian:~$ /usr/bin/python3 /home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py
Downloading:http://example.webscraping.com
Downloading--2
Downloading:http://example.webscraping.com
Downloading --- 5
Downloading:http://example.webscraping.com
Downloading --- 5
Traceback (most recent call last):
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 150, in <module>
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/link_crawler3.py", line 36, in link_crawler
    html = download(url, headers, proxy=proxy, num_retries=num_retries)
  File "/home/cor/webscrappython/Web_Scraping_with_Python/chapter01/common.py", line 72, in download5
    html = opener.open(requestnew).read().decode('utf-8')
  File "/usr/lib/python3.5/urllib/request.py", line 466, in open
    response = self._open(req, data)
  File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
    '_open', req)
  File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
    h.request(req.get_method(), req.selector, req.data, headers)
  File "/usr/lib/python3.5/http/client.py", line 1107, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1147, in _send_request
    self.putheader(hdr, value)
  File "/usr/lib/python3.5/http/client.py", line 1083, in putheader
    if _is_illegal_header_value(values[i]):
TypeError: expected string or bytes-like object
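
Reading the traceback from the bottom up, http.client refuses one of the request headers because its value is not a string. The reason is a signature mismatch: link_crawler() calls download(url, headers, ...), but download is bound to common.download5(url, user_agent='wswp', proxy=None, num_retries=2), so the whole headers dict is received as user_agent and ends up as the value of the 'User-agent' header. One way out is a downloader that accepts a prepared headers dict; a minimal sketch (download6 is a made-up name, not from the book):

from urllib import request
from urllib.parse import urlparse


def download6(url, headers=None, proxy=None, num_retries=2):
    """Sketch of a downloader that takes a prepared headers dict."""
    print('Downloading:%s' % url)
    req = request.Request(url, headers=headers or {})
    opener = request.build_opener()
    if proxy:
        # route this URL's scheme through the proxy
        opener.add_handler(request.ProxyHandler({urlparse(url).scheme: proxy}))
    try:
        html = opener.open(req).read().decode('utf-8')
    except request.URLError as e:
        print('Download error:%s' % e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry 5XX HTTP errors
            html = download6(url, headers, proxy, num_retries - 1)
    return html

With this signature, the existing call download(url, headers, proxy=proxy, num_retries=num_retries) in link_crawler() would work unchanged after pointing download = download6 in common.py.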

  4.   Below is common.py:

 

# -*- coding: utf-8 -*-

from urllib import request
from urllib.parse import urlparse


def download1(url):
    """Simple downloader"""
    # before
    #return urllib.urlopen(url).read()
    #after, using urllib.request instead
    print('Downloading--1')
    return request.urlopen(url)

def download2(url):
    """Download function that catches errors"""
    print('Downloading:%s'%url)
    print('Downloading--2')
    try:
        html = request.urlopen(url).read()
    except request.URLError as e:
        print('Download error:%s'%e.reason)
        html = None
    return html
download2('http://example.webscraping.com')

def download3(url, num_retries=2):
    """Download function that also retries 5XX errors"""
    print('Downloading:%s'%url)
    print('Downloading--3')
    try:
        html = request.urlopen(url).read()
    except request.URLError as e:
        print('Download error:%s'% e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download3(url, num_retries-1)
    return html


def download4(url, user_agent='wswp', num_retries=2):
    """Download function that includes user agent support"""
    print('Downloading:%s'%url)
    print('Downloading--4')
    headers = {'User-agent': user_agent}
    requestnew = request.Request(url, headers=headers)
    try:
        html = request.urlopen(requestnew).read()
    except request.URLError as e:
        print('Download error:%s'%e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download4(url, user_agent, num_retries-1)
    return html


def download5(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download function with support for proxies"""
    print('Downloading:%s'%url)
    print('Downloading --- 5')
    headers = {'User-agent': user_agent}
    requestnew = request.Request(url, headers=headers)
    opener = request.build_opener()
    if proxy:
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        html = opener.open(requestnew).read().decode('utf-8')
    except request.URLError as e:
        print('Download error:%s'%e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download5(url, user_agent, proxy, num_retries-1)
    return html


download = download5


if __name__ == '__main__':
    print(download('http://example.webscraping.com'))
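
One more caveat about download5(): it decodes every response as UTF-8, so a page that declares a different charset would raise UnicodeDecodeError. A small defensive variant, using the charset the server declares and falling back to UTF-8 (my own sketch, not code from the book; error handling omitted):

from urllib import request


def download_decoded(url, user_agent='wswp'):
    """Sketch: decode the body with the charset declared by the server."""
    req = request.Request(url, headers={'User-agent': user_agent})
    response = request.urlopen(req)
    raw = response.read()
    # response.headers is an http.client.HTTPMessage and knows the declared charset
    charset = response.headers.get_content_charset() or 'utf-8'
    return raw.decode(charset, errors='replace')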

5.  A quick experiment with urllib.request and urllib.parse in the interactive interpreter: encode a form with urlencode(), then send it as POST data through a Request object.

>>> import urllib.parse
>>> import urllib.request
>>> url = 'http://www.someserver.com/cgi-bin/register.cgi'
>>> user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
>>> url
'http://www.someserver.com/cgi-bin/register.cgi'
>>> values = {'name': 'Michael Foord',
...           'location': 'Northampton',
...           'language': 'Python' }
>>> headers = {'User-Agent': user_agent}

>>> data=urllib.parse.urlencode(values)
>>> data
'location=Northampton&language=Python&name=Michael+Foord'
>>> data = data.encode('ascii')
>>> data
b'location=Northampton&language=Python&name=Michael+Foord'
>>> req = urllib.request.Request(url, data, headers)
>>> with urllib.request.urlopen(req) as response:
...     the_page = response.read()
...     print(the_page)
... 

 

b'<!DOCTYPE html><html><head><meta http-equiv="x-ua-compatible" content="IE=edge"><title></title><script type="text/javascript">(function() {var p = "eyJ1cmkiOiIvY2dpLWJpbi9yZWdpc3Rlci5jZ2kiLCJhcmdzIjoiIiwicmVmZXJlciI6IiJ9:1jHNCg:E4Xczfh7oF8UHMBAouFg0z9KGN8", as = "http://www.someserver.com/mtm/async/", f = "http://www.someserver.com/mtm/direct/";function d(n){window.location.href = "http://www42.someserver.com/"+n;}function ar(r) {if (r.slice(0, 1) !== ".") {try {window.location.assign(r);} catch (err) {}try {var mar = document.createElement("meta");mar.httpEquiv = "refresh";mar.content = "0;url="+r;document.getElementsByTagName("head")[0].appendChild(mar);} catch (err) {}} else {var s = document.createElement("span");s.id="ecode";s.appendChild(document.createTextNode(r.slice(1)));document.getElementsByTagName("body")[0].appendChild(s);}}if ("fetch" in window) {try {fetch(as + p + "/1", {credentials: "include"}).then(function(r) {if (!r.ok) {throw Error("50x");}return r.text();}).then(function(r) {ar(r);});} catch (err) {d(2);}} else {try {var x = new XMLHttpRequest();x.open("GET", as + p + "/2", false);x.onerror = function() {d(3);};x.onload = function() {if (x.status === 200) {ar(x.responseText);} else {d(4);}};x.onreadystatechange = function(r) {if (x.readyState === 4){if (x.status === 200) {ar(x.responseText);} else {d(6);}}};x.send();} catch (err) {d(5);}}})();</script><meta http-equiv="refresh" content="5;url=http://www.someserver.com/mtm/direct/eyJ1cmkiOiIvY2dpLWJpbi9yZWdpc3Rlci5jZ2kiLCJhcmdzIjoiIiwicmVmZXJlciI6IiJ9:1jHNCg:E4Xczfh7oF8UHMBAouFg0z9KGN8/1" /></head><body></body></html>'

>>>
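
The session above sends the form fields as POST data. The same urlencode() output can also be appended to the URL as a query string to make a GET request instead; a quick script-style sketch reusing the names from the session (someserver.com is only the placeholder host used above):

import urllib.parse
import urllib.request

values = {'name': 'Michael Foord', 'location': 'Northampton', 'language': 'Python'}
query = urllib.parse.urlencode(values)
req = urllib.request.Request(
    'http://www.someserver.com/cgi-bin/register.cgi?' + query,
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
with urllib.request.urlopen(req) as response:
    print(response.read()[:200])   # first 200 bytes of the body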

  

 

posted @ 2020-03-26 16:13  碧水东流至此回