[Python]爬虫实例-深大新闻标题
#单页
# Single page: fetch the SZU "media view" news page and print each title.
import requests
from lxml import etree

# 1. Fetch the page
url = "https://www.szu.edu.cn/index/mtsd.htm"
# timeout keeps the script from hanging forever on a stalled connection
response = requests.get(url, timeout=10)
# fail loudly on HTTP errors instead of silently parsing an error page
response.raise_for_status()
response.encoding = "utf-8"  # site serves UTF-8; avoid mojibake from guessed encoding
wb_data = response.text
html = etree.HTML(wb_data)

# 2. Locate the data: each news title is the link text of a list item
infos = html.xpath("//ul[@class='news-list']/li/a/text()")
for info in infos:
    print(info)
# Multi page: crawl every listing page and write all news titles to a CSV file.
import csv  # BUG FIX: csv.DictWriter was used below but csv was never imported
import requests
from lxml import etree

# 1. Build the URL list: pages 2..46 are mtsd/1.htm .. mtsd/45.htm,
#    the first page is mtsd.htm; reverse so crawling starts from page 1.
urls = ["https://www.szu.edu.cn/index/mtsd/{}.htm".format(i) for i in range(1, 46)]
urls.append("https://www.szu.edu.cn/index/mtsd.htm")
urls = urls[::-1]

# 2. Open the output file.
# BUG FIX: "ANSI" is not a codec name Python accepts (raises LookupError).
# "utf-8-sig" writes a UTF-8 BOM so Excel opens the Chinese text correctly.
# newline="" is required by the csv module to avoid blank rows on Windows.
# The with-statement guarantees the file is closed even if a request fails.
with open("深大爬虫.csv", "w", encoding="utf-8-sig", newline="") as f:
    fileheader = ["标题"]
    dict_writer = csv.DictWriter(f, fileheader)
    dict_writer.writeheader()
    # Crawl each page in order
    for url in urls:
        response = requests.get(url, timeout=10)  # timeout: don't hang forever
        response.encoding = "utf-8"
        wb_data = response.text
        html = etree.HTML(wb_data)
        # 3. Locate the titles on this page
        infos = html.xpath("//ul[@class='news-list']/li/a/text()")
        for info in infos:
            dict_writer.writerow({"标题": info})
# 注意:Python 并不接受名为 "ANSI" 的编码(会抛 LookupError);若要让 Excel 打开不乱码,应使用 "utf-8-sig"(带 BOM 的 UTF-8)