# Scrape data from a website; because there are many pages, multiple threads are used to speed up crawling. Full code below.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup as Bs4
import threading
# Request headers sent with every page fetch: a desktop-Chrome User-Agent
# (many sites block the default requests UA) and a zh-CN language preference.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
"Accept-Language": "zh-CN,zh;q=0.9"
}
# NOTE(review): name_list is never referenced in the visible code — it looks
# like a leftover accumulator; confirm before removing.
name_list = []
def url_text(url, n):
    """Fetch one listing page and append every entry found to smiles_0716.txt.

    Runs as a thread worker: fetches *url*, extracts each ``.list-item``
    element's ``.dp-b`` text (name) and ``.content-img`` text (data), and
    writes one ``str(dict)`` line per entry.

    Parameters
    ----------
    url : str
        The listing-page URL to fetch.
    n : int
        Id offset for this page; entries are numbered n+1, n+2, ...
    """
    try:
        # The request itself is inside the try so network failures are
        # reported too (the original left requests.get uncovered even
        # though the error message says "request error").  A timeout
        # prevents a dead server from hanging the thread forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = Bs4(response.text, 'lxml')
        lines = []
        for item in soup.select(".list-item"):
            n += 1
            # Fresh dict per item: the original reused one dict, so a
            # previous item's "name"/"data" leaked into items whose
            # selectors matched nothing.
            data_dict = {"id": n}
            for name_tag in item.select(".dp-b"):
                data_dict["name"] = name_tag.text.strip()
            for data_tag in item.select(".content-img"):
                data_dict["data"] = data_tag.text.strip()
            print(data_dict)
            lines.append(str(data_dict) + "\n")
        if lines:
            # Open once per page (not once per item) and write in one call,
            # which also keeps each page's output contiguous even though
            # several threads share this file.
            with open("smiles_0716.txt", "a+", encoding="utf-8") as f:
                f.writelines(lines)
    except Exception as exc:
        # Narrowed from a bare `except:`; report what went wrong instead
        # of swallowing it silently.
        print("请求出错:", exc)
if __name__ == "__main__":
n = 0
for num in range(1,20):
url = "https://www.xxx.com/index_{}.html".format(num)
"""
- 你写好代码
- 交给解释器运行: python thread1.py
- 解释器读取代码,再交给操作系统去执行,根据你的代码去选择创建多少个线程/进程去执行(单进程/多线程)。
- 操作系统调用硬件:硬盘、cpu、网卡....
"""
t = threading.Thread(target=url_text, args=(url,n,))
t.start()
n = n + 10