# Python: scraping WeChat mini-program information
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import html

# Browser-like User-Agent sent with every request so the site serves the
# same markup it would serve to a regular browser.
Header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3928.4 Safari/537.36'
}

# Root used to resolve the (possibly relative) article links on list pages.
BASE_URL = "http://www.wxapp-union.com/"
# List-page URL template; ``{}`` is replaced by the 1-based page number.
LIST_URL = "http://www.wxapp-union.com/portal.php?mod=list&catid=1&page={}"
# Output file; rows are appended, so repeated runs accumulate.
CSV_PATH = "微信小程序.csv"
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def _fetch(url):
    """Download *url* and return the response with its text encoding detected.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    resp = requests.get(url, headers=Header, timeout=REQUEST_TIMEOUT)
    # Let requests guess the charset from the body rather than relying only
    # on the Content-Type header.
    resp.encoding = resp.apparent_encoding
    return resp


def _node_text(node):
    """Return the text of a BeautifulSoup node, or "" when the node is missing."""
    return node.text if node is not None else ""


def getdata(url):
    """Fetch one article page, print its fields and append them to the CSV.

    The title (``h1.ph``), summary (``div.blockquote``) and publication time
    (``span.time``) are extracted. A field whose element is absent from the
    page is recorded as an empty string instead of aborting the crawl.

    Args:
        url: Absolute URL of the article page.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    soup = BeautifulSoup(_fetch(url).text, "lxml")

    title = _node_text(soup.find('h1', class_='ph'))
    summary = _node_text(soup.find('div', class_='blockquote'))
    published = _node_text(soup.find('span', class_='time'))

    print(title)
    print(summary)
    print(published)

    # Save to the CSV file. An explicit encoding keeps the output independent
    # of the platform default; newline="" is what the csv module expects.
    with open(CSV_PATH, "a", newline="", encoding="utf-8") as cf:
        csv.writer(cf).writerow([title, summary, published])


def parse_page(url):
    """Scrape every article linked from one list page.

    Args:
        url: Absolute URL of a list page.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    tree = html.fromstring(_fetch(url).text)
    # Collect the article hyperlinks on this list page.
    anchors = tree.xpath('//*[@id="itemContainer"]/div/div/h3/a')
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        # urljoin handles relative, root-relative and absolute hrefs alike.
        getdata(urljoin(BASE_URL, href))
    # NOTE: emitted once per list page, as in the original script.
    print('微信小程序全部爬取完成')


def began(first_page=1, last_page=107):
    """Crawl the list pages ``first_page``..``last_page`` (inclusive).

    The defaults cover the 107 list pages the original script was written for.

    Args:
        first_page: Number of the first list page to crawl.
        last_page: Number of the last list page to crawl.
    """
    for page in range(first_page, last_page + 1):
        parse_page(LIST_URL.format(page))


if __name__ == '__main__':
    began()