使用requests+pyquery爬取dd373地下城跨五最新商品信息
废话不多说直接上代码:
可以使用openpyxl库将爬取的信息写入Excel表格中,这部分代码我就不上传了
from typing import Optional
from urllib.parse import urlencode

import requests
from pyquery import PyQuery as pq
from requests import RequestException

# Browser-like User-Agent sent with every request made through `requests`.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

# Query-string filter appended to every search URL.
SEARCH_PARAMS = {
    "minPrice": 333,
    "maxPrice": "",
}

# Search URL template: first placeholder is the page index, second is the
# url-encoded query string.
SEARCH_URL = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-%s.html?%s"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10


def _search_url(page: int) -> str:
    """Return the search URL for the given page index."""
    return SEARCH_URL % (page, urlencode(SEARCH_PARAMS))


def open_sh() -> Optional[str]:
    """Download the search page with index 0 and return its HTML.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        which includes timeouts).
    """
    url = _search_url(0)
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print("链接错误", url)
        return None
    if response.status_code == 200:
        return response.text
    return None


def _parse_products(html: str) -> list:
    """Extract one dict per listing found in the given search-page HTML."""
    doc = pq(html)
    listings = doc("div.content").find(".box.money_ner").items()
    return [
        {
            "地址": item.find("a.titleText").attr("href"),
            "账号信息": item.find("a.titleText").text(),
            "价格": item.find("div.money_text strong span").text() + '元',
            "是否存在": item.find("div.num.left").text(),
        }
        for item in listings
    ]


def doc_page(html: str) -> None:
    """Print every listing found in the given search-page HTML.

    Each listing is printed as a dict holding the link, the title text,
    the price (with a trailing '元') and the text of the ``div.num.left``
    element.
    """
    for product in _parse_products(html):
        print(product)


def page_sh(pagebox: int) -> None:
    """Walk result pages 1..pagebox and print the listings of each one.

    Each page is downloaded exactly once; the same HTML is used both to
    verify which page the site actually served and to extract listings.
    Pages that do not answer with status 200 are skipped. Network errors
    and parse errors propagate to the caller.

    Args:
        pagebox: Total number of result pages to visit.
    """
    for page in range(1, pagebox + 1):
        response = requests.get(
            _search_url(page), headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            continue
        html = response.text
        # Only parse when the highlighted pager entry matches the page we
        # asked for. NOTE(review): presumably this guards against the site
        # serving a different page than requested - confirm.
        if page_currentpage(html) == page:
            doc_page(html)


def page_currentpage(html: str) -> int:
    """Return the page number highlighted in the pager of the given HTML.

    Raises:
        ValueError: If the text of ``a.nb.currentpage`` is not an integer
            (for example when the element is missing and the text is empty).
    """
    doc = pq(html)
    currentpage = doc("a.nb.currentpage").text()
    return int(currentpage)


def page_box(html: str) -> int:
    """Return the total number of result pages read from the pager text.

    Raises:
        ValueError: If the sliced pager text is not an integer.
    """
    doc = pq(html)
    # NOTE(review): the fixed [9:-1] slice assumes the pager text carries a
    # 9-character prefix and a 1-character suffix around the page count -
    # confirm against the live markup; any layout change breaks this.
    pagebox = doc(".pagebox.clear ul li.yeshu").text()[9:-1]
    return int(pagebox)


def main() -> None:
    """Fetch the first search page, read the page count, then crawl all pages."""
    html = open_sh()
    if html is None:
        # open_sh() returns None on a non-200 status or a request error;
        # without the first page the page count cannot be determined.
        print("获取首页失败")
        return
    page_sh(page_box(html))


if __name__ == "__main__":
    main()