# 爬取genome的网页和图片 (Fetch a genome.jp web page and its image)
# -*- coding: utf-8 -*-
# @Time : 2018/03/08 10:32
# @Author : cxa
# @File : gethtmlandimg.py
# @Software: PyCharm
"""Download a KEGG pathway page together with its pathway image.

The page at ``url`` is saved as an HTML file in the current working
directory.  The ``src`` of the pathway image inside that page is rewritten to
point at a PNG saved next to it, so the saved copy references the local image.
"""
import os
import traceback
from urllib.parse import urljoin

import requests
from fake_useragent import UserAgent as UA
from lxml import html

# Seconds to wait on every HTTP request (the image request previously had no
# timeout at all and could hang forever).
REQUEST_TIMEOUT = 20

url = "http://www.genome.jp/kegg-bin/show_pathway?1520394169137283/hsa01100.args"

# Local file names are derived from the last URL segment ("hsa01100.args"):
# its extension is swapped for ".html" / ".png".
_stem = os.path.splitext(url.split("/")[-1])[0]
_img_name = _stem + ".png"
html_path = os.path.join(os.getcwd(), _stem + ".html")
img_path = os.path.join(os.getcwd(), _img_name)

headers = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    # Was 'Accept - Encoding' (with spaces), which is not a valid header
    # field name, so the intended header was never sent.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'User-Agent': UA().random,
}

# XPath selecting the ``src`` attribute of the pathway image.
# (The name keeps its original spelling so existing references still work.)
img_xapth = "//div[@class='map']/div[@class='image']/img[@name='pathwayimage']/@src"
main_url = "http://www.genome.jp"


def get_img(buff):
    """Write the raw image bytes ``buff`` to ``img_path``."""
    with open(img_path, "wb") as fs:
        fs.write(buff)


def _fetch(target):
    """GET ``target`` with the shared headers and timeout.

    Returns the response when the status is exactly 200.  Raises
    ``requests.HTTPError`` for 4xx/5xx.  Any other status returns ``None``,
    which mirrors the original script: it saved nothing in that case.
    """
    resp = requests.get(target, timeout=REQUEST_TIMEOUT, headers=headers)
    if resp.status_code != requests.codes.ok:
        resp.raise_for_status()
        return None
    return resp


def main():
    """Save the pathway page and its image; print a traceback on failure."""
    try:
        # The page request now lives inside the ``try`` so that connection
        # errors and timeouts are reported like every other failure.
        page = _fetch(url)
        if page is None:
            return
        page_html = page.text

        # Evaluate the XPath once and fail with a clear message when the
        # image is missing, instead of an opaque IndexError.
        matches = html.fromstring(page_html).xpath(img_xapth)
        if not matches:
            raise LookupError("pathway image not found in {}".format(url))
        img_src = matches[0]

        # Explicit encoding: the platform default may be unable to encode
        # the page text on non-UTF-8 locales.
        with open(html_path, "w", encoding="utf-8") as fs:
            fs.write(page_html.replace(img_src, "./{}".format(_img_name)))

        # urljoin handles root-relative paths (same result as the old string
        # concatenation) as well as absolute or protocol-relative ``src``.
        image = _fetch(urljoin(main_url, img_src))
        if image is not None:
            get_img(image.content)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are no longer swallowed.
        print(traceback.format_exc())


# Executed on load, exactly as the original flat script did.
main()