[Python]爬虫实例-深大新闻标题

 #单页
import requests
from lxml import etree

# 1. Fetch the page.
url = "https://www.szu.edu.cn/index/mtsd.htm"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
# Decode the response body as UTF-8.
response.encoding = "utf-8"
wb_data = response.text
html = etree.HTML(wb_data)

# 2. Locate the data: the text of every link inside the "news-list" <ul>.
infos = html.xpath("//ul[@class='news-list']/li/a/text()")
for info in infos:
    print(info)

 

 

#多页
import csv

import requests
from lxml import etree

# 1. Build the page list: the index page first, then the numbered archive
#    pages from 45 down to 1.
urls = ["https://www.szu.edu.cn/index/mtsd.htm"]
urls.extend(
    "https://www.szu.edu.cn/index/mtsd/{}.htm".format(i) for i in range(45, 0, -1)
)

# 2. Write the titles to a CSV file.
#    - "utf-8-sig" writes a UTF-8 byte-order mark so Excel detects the encoding
#      and shows the Chinese text correctly. The previous "ANSI" codec name is
#      only available on Windows and cannot encode characters outside the
#      local code page.
#    - newline="" is what the csv module expects; without it Windows output
#      gets an extra blank line after every row.
#    - The with-block closes the file even if a request or parse step fails.
fileheader = ["标题"]
with open("深大爬虫.csv", "w", encoding="utf-8-sig", newline="") as f:
    dict_writer = csv.DictWriter(f, fileheader)
    dict_writer.writeheader()
    # Walk through the pages one by one.
    for url in urls:
        try:
            # The timeout prevents one stalled page from blocking the crawl.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Report the failed page and carry on with the remaining ones.
            print("skipping {}: {}".format(url, exc))
            continue
        response.encoding = "utf-8"
        wb_data = response.text
        html = etree.HTML(wb_data)
        # 3. Locate the data on this page: the text of every link inside the
        #    "news-list" <ul>.
        infos = html.xpath("//ul[@class='news-list']/li/a/text()")
        for info in infos:
            dict_writer.writerow({"标题": info})

 

 

posted @ 2020-06-09 17:23  SkyBiuBiu  阅读(244)  评论(0)  编辑  收藏  举报