Python抓取网页例子

功能

  1. 一级分页列表页, 二级数据页
  2. 不定表头, 写入CSV
  3. 正则匹配, 使用非捕获分组 (?:...) 实现只匹配, 不捕获
  4. HTTP头设置
#!/usr/bin/python3
# -*- coding: UTF-8 -*-

import re
import time
import requests
import csv

# A single Session object is shared by every request the script makes.
session = requests.Session()

# Request headers sent with every GET; they mirror the captured Firefox
# request kept in the string literal below.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Referer': 'http://jtgl.beijing.gov.cn/jgj/93950/check_car/ycl/index.html',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.5,en;q=0.3,de-DE;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'max-age=0',
}

# Scheme + host prefix that is prepended to every site-relative path.
base_link = 'http://somewhere'

'''
GET /jgj/93950/check_car/ycl/22719550-2.html HTTP/1.1
Host: jtgl.beijing.gov.cn
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Language: en-US,zh-TW;q=0.8,zh-CN;q=0.7,zh;q=0.5,en;q=0.3,de-DE;q=0.2
Accept-Encoding: gzip, deflate
Referer: http://jtgl.beijing.gov.cn/jgj/93950/check_car/ycl/index.html
Connection: keep-alive
Cookie: _trs_uv=kde6gvt0_365_j8m; _va_id=ec5b1ecd783dfc52.1596438844.30.1615800849.1615799023.; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22173b32ba72347-0efda1230057718-4c302273-2073600-173b32ba72436a%22%7D; _va_ref=%5B%22%22%2C%22%22%2C1615800833%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D54coDM4TQyJUBe5UqX7uRRIy7UXZ8zHoufRE-ufZk-8fMFrJRtMfs_TQ-sPD2UYtZnPmS86B0DUph3QiQ_8j0sKSmN4ZJVuCRurpig3dJ3W%26wd%3D%26eqid%3Db7d847b70004074600000003604f0468%22%5D; __jsluid_h=9c085f396ee2cc91e800331f4d8fd4a8; _va_ses=*
Upgrade-Insecure-Requests: 1
If-Modified-Since: Mon, 15 Mar 2021 06:28:19 GMT
If-None-Match: W/"604efe83-a36a"
Cache-Control: max-age=0
'''

def request_get(url, encoding='UTF-8', tout=20, retries=10):
    """Fetch *url* through the shared session and return the body as text.

    Args:
        url: Absolute URL to GET.
        encoding: Encoding forced onto the response before decoding it.
        tout: Timeout in seconds passed to ``session.get``.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded response body, or ``None`` when every attempt failed
        (callers must handle the ``None`` case).
    """
    for _ in range(retries):
        # Small pause before every attempt to avoid hammering the server.
        time.sleep(0.2)
        try:
            response = session.get(url, timeout=tout, headers=header)
        except requests.ReadTimeout:
            print('ReadTimeout')
        except requests.ConnectionError:
            # Must be requests' own ConnectionError: it does not inherit from
            # the builtin ConnectionError, so catching the builtin never
            # matched and these failures were reported as RequestException.
            print('ConnectionError')
        except requests.RequestException:
            print('RequestException')
        else:
            response.encoding = encoding
            return response.text

    print('Exceed retry limit')
    return None

def lv1_to_lv2(page):
    """Return the second-level page links found on listing page *page*.

    Args:
        page: Page number inserted into the first-level listing URL.

    Returns:
        A list of ``href`` values (as they appear in the HTML) taken from the
        ``content_li_title`` anchors; empty when the page could not be
        downloaded or contains no such anchors.
    """
    link_lv1 = base_link + '/jgj/93950/check_car/ycl/22719550-' + str(page) + '.html'
    content = request_get(link_lv1, 'UTF-8', 20, 10)
    if content is None:
        # request_get gave up after its retries; without this guard the regex
        # call below raises TypeError on None.
        return []

    # With a single capture group, findall returns just the captured hrefs.
    return re.findall(r'<p class="content_li_title"><a href="([^"]+)" onclick', content)

def lv2_to_data(link_lv2):
    """Download a second-level page and extract its date and table rows.

    Args:
        link_lv2: Site-relative link of the data page; ``base_link`` is
            prepended before the request is made.

    Returns:
        A dict with keys ``year``, ``month``, ``date`` (strings captured from
        the page title, or the integer 0 when the title is not found) and
        ``rows``, a list of ``{'name': ..., 'value': ...}`` dicts, one per
        matching table row. When the page cannot be downloaded the date
        fields are 0 and ``rows`` is empty.
    """
    link_lv2 = base_link + link_lv2
    content = request_get(link_lv2, 'UTF-8', 20, 10)

    data = {'year': 0, 'month': 0, 'date': 0, 'rows': []}
    if content is None:
        # request_get gave up after its retries; return the "no data" shape
        # instead of crashing in the regex calls below.
        return data

    # The title is matched on the raw HTML, before the markup is stripped.
    title = re.search(
        r'<p class="titles">(\d+)\.?年(\d+)月(\d+)日全市检测场实际(?:检|验)',
        content,
    )
    if title:
        data['year'], data['month'], data['date'] = title.groups()

    # Strip whitespace and styling attributes so each table row collapses to
    # bare <tr><td>...</td></tr> markup that the row pattern can match.
    content = remove_style(content)
    row_matches = re.finditer(
        r'<tr><td>\d+</td>(?:<td>[^<]+</td>)?<td>([^<]+)</td><td>(\d+)</td>(?:<td></td>)?</tr>',
        content,
    )
    data['rows'] = [
        {'name': found.group(1).replace('&nbsp;', ''), 'value': found.group(2)}
        for found in row_matches
    ]
    return data


# Raw string: "\s" and "\d" are invalid escape sequences in a normal string
# literal and trigger a warning on current Python versions. Compiled once at
# import time instead of on every call.
_STYLE_PATTERN = re.compile(
    r'(\s+|<span[^>]*>|</span>|<p[^>]*>|</p>|</?strong>|<font[^>]+>|</font>'
    r'|rowspan="\d+"|class="[^"]+"|style="[^"]+"|height="\d+"|width="\d+"'
    r'|x:num="\d+"|x:str="")'
)


def remove_style(text):
    """Return *text* with whitespace and presentational markup removed.

    Deletes all whitespace runs, ``<span>``/``<p>``/``<strong>``/``<font>``
    tags, and the ``rowspan``, ``class``, ``style``, ``height``, ``width``,
    ``x:num`` and ``x:str=""`` attributes, leaving other markup in place.

    Args:
        text: HTML source as a string.

    Returns:
        The cleaned string.
    """
    return _STYLE_PATTERN.sub('', text)


# main process
# csv_rows collects one parsed record per data page; csv_headers is a dict
# used as an insertion-ordered set of every row name seen, because the
# column set differs from page to page.
csv_rows = []
csv_headers = {}

for page in range(1, 81):
    print(page)
    links_lv2 = lv1_to_lv2(page)
    for link_lv2 in links_lv2:
        print(link_lv2)
        data = lv2_to_data(link_lv2)
        csv_rows.append(data)
        for row in data['rows']:
            csv_headers.setdefault(row['name'], 1)

with open('output.csv', 'w', encoding='utf8', newline='') as csvfile:
    # The 'date' column comes first, followed by row names in first-seen order.
    fieldnames = ['date', *csv_headers]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for csv_row in csv_rows:
        row = {
            'date': '{}.{}.{}'.format(csv_row['year'], csv_row['month'], csv_row['date']),
        }
        # Names missing from a page are left out here; DictWriter fills those
        # cells with its default empty value.
        row.update((f['name'], f['value']) for f in csv_row['rows'])
        writer.writerow(row)


posted on 2021-03-16 01:54  Milton  阅读(186)  评论(0)  编辑  收藏  举报

导航