# coding: utf-8
"""Crawler for the CICPA public member registry (cmispub.cicpa.org.cn).

Walks the paginated index, follows every person's static detail page,
appends the parsed table cells to a per-run CSV, and saves the raw HTML
next to it.  Pages/details that fail are logged to per-run error files
so they can be retried later.
"""
import datetime
import os
import re
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup

# Browser-like UA: the site rejects the default urllib agent.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36')

# Anchor on each index row carries the person code:
# javascript:viewDetail('CODE', ...)
_DETAIL_CODE_RE = re.compile(r"javascript:viewDetail\(\'(\w+?)\',")


def get_html(url, values):
    """POST *values* to *url* and return ``(html, status_code)``.

    The response body is decoded as GBK (the site's encoding).  On an
    HTTPError the server's status code is returned with an empty body.
    On a network-level URLError the request is retried once; if both
    attempts fail, -1 is reported (URLError carries no HTTP code — the
    original ``int(e.code)`` raised AttributeError here).
    """
    html = ''
    status_code = 200
    headers = {'User-Agent': USER_AGENT}
    data = urllib.parse.urlencode(values).encode(encoding='UTF8')
    for attempt in range(1, 3):
        req = urllib.request.Request(url=url, headers=headers, data=data)
        try:
            response = urllib.request.urlopen(req)
        except HTTPError as e:
            print(url, values)
            print('The server couldn\'t fulfill the request.')
            print('HTTP Error,code: ', e.code)
            status_code = int(e.code)
            break
        except URLError as e:
            # BUG FIX: URLError has no .code attribute; use -1 sentinel.
            status_code = -1
            print('We failed to reach a server.Reason: ', e.reason)
            print('url: %s, status code:%d, retry count:%d'
                  % (url + '?' + bytes.decode(data), status_code, attempt))
        else:
            html = response.read().decode('gbk')
            # BUG FIX: reset the code after a successful retry, otherwise
            # main() would discard a page that was actually fetched.
            status_code = 200
            break
    return html, status_code


def request_page(page):
    """Fetch one page of the member index (``indexQuery`` POST)."""
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/PersonIndexAction.do'
    values = {
        'method': 'indexQuery',
        'queryType': '2',
        'isStock': '00',
        'pageSize': '',
        'pageNum': page,
        'offName': '',
        'ascGuid': '',
        'perCode': 0,
        'perName': '',
    }
    return get_html(url, values)


def parse_cicpa_page(html):
    """Return the result-table anchor elements (one per person)."""
    soup = BeautifulSoup(html, 'html.parser')
    return soup.select("#tabDetail a")


def request_detail(code):
    """Fetch the static detail page for one person *code*."""
    print('request code:', code)
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/07/' + code + '.shtml'
    return get_html(url, {})


def parse_detail_header(html):
    """Extract the detail table's column labels as one CSV header line."""
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.select("#detailtb td.tdl")
    # join keeps empty leading/trailing fields (the old strip(',') dropped
    # them and could misalign the CSV columns).
    return ','.join(cell.get_text().strip() for cell in cells)


def parse_detail_content(html):
    """Extract the detail table's data cells as one CSV data line."""
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.select("#detailtb td.data_tb_content")
    return ','.join(cell.get_text().strip() for cell in cells)


def create_file(filepath, header):
    """Create *filepath* (and its directory) if missing.

    Writes *header* plus a newline when the file is newly created and
    *header* is non-empty.  Existing files are left untouched.
    """
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if not os.path.exists(filepath):
        with open(filepath, 'w') as f:
            if header:
                f.write(header + '\n')


def is_down_exists(code):
    """Placeholder for an 'already downloaded' check; always False."""
    return False


def main():
    """Crawl all index pages, saving detail CSV rows and raw HTML."""
    start_time = datetime.datetime.now()
    stamp = start_time.strftime("%Y%m%d_%H%M%S_%f")
    html_dir = 'D:/crawl_data/cicpa/html/'
    if not os.path.isdir(html_dir):
        os.makedirs(html_dir)
    header_file = 'D:/crawl_data/cicpa/header.csv'
    need_header = not os.path.exists(header_file)
    datafile = 'D:/crawl_data/cicpa/data_%s.csv' % stamp
    page_error_file = 'D:/crawl_data/cicpa/error_page_%s.txt' % stamp
    detail_error_file = 'D:/crawl_data/cicpa/error_detail_%s.txt' % stamp
    create_file(datafile, '')
    create_file(page_error_file, 'page,status')
    create_file(detail_error_file, 'code,status')
    # Context managers guarantee the handles are closed even if the
    # crawl loop raises (the original leaked them on any exception).
    with open(datafile, 'w') as data_f, \
            open(page_error_file, 'w') as page_err_f, \
            open(detail_error_file, 'w') as detail_err_f:
        for page in range(1, 6912):
            print('request:', page)
            result, status = request_page(page)
            if status != 200:
                page_err_f.write('%d,%d\n' % (page, status))
                page_err_f.flush()
                continue
            for item in parse_cicpa_page(result):
                matches = _DETAIL_CODE_RE.findall(str(item))
                if not matches:
                    # BUG FIX: [0] on an empty findall() raised IndexError
                    # for anchors that are not viewDetail links.
                    continue
                code = matches[0]
                html_file_path = html_dir + code + '.html'
                if os.path.exists(html_file_path):
                    continue  # already saved on a previous run
                detail_html, status = request_detail(code)
                if len(detail_html) == 0:
                    detail_err_f.write(code + ',%d\n' % status)
                    detail_err_f.flush()
                    continue
                if need_header:
                    # Column labels are identical on every detail page,
                    # so capture them once per installation.
                    with open(header_file, 'w') as hf:
                        hf.write(parse_detail_header(detail_html) + '\n')
                    need_header = False
                # save base data
                line = parse_detail_content(detail_html)
                data_f.write(line + '\n')
                data_f.flush()
                # save html
                with open(html_file_path, 'w') as hf:
                    hf.write(detail_html + '\n')
                print(line)
    elapsed = datetime.datetime.now() - start_time
    # BUG FIX: .microseconds is only the fractional-second component and
    # was mislabeled 'ms'; report the true elapsed time in milliseconds.
    print('finished in', elapsed.total_seconds() * 1000, 'ms')


if __name__ == '__main__':
    main()