pyqt5中多线程爬虫
设立爬虫类(Class),继承 PyQt5 中的 QThread
在 run() 函数中使用普通线程(threading.Thread)并发爬取
整体代码:
import json
import os
import sys
import threading
import time

import pandas as pd
import requests
import urllib3
from PyQt5.QtCore import QThread, pyqtSignal
from PyQt5.QtGui import QStandardItem, QStandardItemModel
from PyQt5.QtWidgets import QApplication, QMainWindow, QMessageBox, QVBoxLayout
from urllib3.exceptions import InsecureRequestWarning

from ui.ui_test import Ui_MainWindow


class kuandaiSpiderThread(QThread):
    """Background QThread that queries broadband-account details.

    ``run()`` splits the account list into at most ``max_workers`` groups,
    fetches each group on a plain ``threading.Thread`` and, once every worker
    has finished, emits the collected results and a summary message.
    """

    # Custom signals, emitted from run() once all workers have finished.
    sinOut = pyqtSignal(list)  # carries the list of fetched records
    sinOut_lable = pyqtSignal(str)  # carries the summary text for the bottom label

    # Upper bound on the number of concurrent worker threads.
    max_workers = 5
    # Seconds to wait for the server before treating a query as failed.
    request_timeout = 30

    def __init__(self, datas):
        """Store the accounts to query and prepare the request settings.

        :param datas: list of account-number strings (blank entries are
            ignored when the thread runs).
        """
        super().__init__()
        self.kuandaihao_list = datas
        # NOTE: the original author observed that the query works with or
        # without a cookie.
        # NOTE(review): the token and session cookie below are hard-coded
        # credentials; they should be loaded from configuration rather than
        # kept in source control.
        self.headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Authorization': 'Basic c2YtaWRzOnNmLWlkcy1zZWNyZXQ=',
            'Blade-Auth': 'bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzUxMiJ9.eyJpc3MiOiJpc3N1c2VyIiwiYXVkIjoiYXVkaWVuY2UiLCJ0ZW5hbnRfaWQiOiIwMDAwMDAiLCJ1c2VyX2lkIjoidHRfZHFfd2FuZ3hpbnpoaSIsInRydWVfbmFtZSI6IueOi-W_g-W_lyIsInVzZXJfbmFtZSI6InR0X2RxX3dhbmd4aW56aGkiLCJkZXRhaWwiOnsiaXNBZG1pbiI6ZmFsc2UsIm1hbmFnZWRQcm9kcyI6bnVsbCwiZGlzdHJpY3RJZCI6IkRJU1RSSUNULTAwMDAxLTAwMDE5LTAwMDA4IiwidGVsZXBob25lIjoiMCIsIm1vYmlsZVBob25lIjoiMTU3MTQ2MTAwODYiLCJkZXB0Q29kZSI6Ii0xIiwiZGVwdElkIjoiSklBS0VfU1lTVEVNX0RFUFQtNDAyODgwYjQ1YjhlMjQ0YzAxNWI4ZTM3NDA3ODAwMTEiLCJ1c2VyRGlzdHJpY3RJZCI6IkRJU1RSSUNULTAwMDAxLTAwMDE5LTAwMDA4IiwiYXJlYSI6bnVsbCwib3JnYW5pemF0aW9uIjpudWxsLCJlbWFpbCI6IjAiLCJ1c2VyVHlwZSI6IjAiLCJpc0xvY2tlZCI6IjAiLCJ1c2VySXAiOiIxMC4xMTIuMTI1LjMwIiwibG9naW5UaW1lIjoiMTcwODkwNzYxNDUzOSJ9LCJkZXB0X2lkIjoiSklBS0VfU1lTVEVNX0RFUFQtNDAyODgwYjQ1YjhlMjQ0YzAxNWI4ZTM3NDA3ODAwMTEiLCJ0b2tlbl90eXBlIjoiYWNjZXNzX3Rva2VuIiwiY2xpZW50X2lkIjoic2YtaWRzIiwiZXhwIjoxNzA4OTU4MDE0fQ.JXEZZ3EI-tBUiUvfzRA9q5SZxuDK74LLYQlWIH7zEk_rfszUIaOcDvREGEsnPZejODqnUn0h5UtP1XsnUTZbSg',
            'Connection': 'keep-alive',
            'Content-Length': '74',
            'Content-Type': 'application/json;charset=UTF-8',
            'Cookie': 'JSESSIONID=Bwq6Ts7HFqrEunwSQeIQtZSzEWlO62Iczgm2XpCk',
            'Host': '10.151.71.133:18088',
            'Referer': 'https://10.151.71.133:18088/ids-web/',
            'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
            'Sec-Ch-Ua-Mobile': '?1',
            'Sec-Ch-Ua-Platform': '"Android"',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.289 Mobile Safari/537.36'
        }
        self.url = 'https://10.151.71.133:18088/bgw/sf-ids-service/cls/POOR_QUALITY_TAG_DETAIL/query_user_info'
        self.output_list = []
        self.fail_output_list = []

    def spider(self, headers, url, datas):
        """Query every account in ``datas`` and record the outcome.

        A non-empty ``data`` field in the JSON reply is appended to
        ``self.output_list``; an empty one, a network error, a timeout or a
        malformed reply puts the account into ``self.fail_output_list``.

        :param headers: HTTP headers sent with each request.
        :param url: endpoint the query is POSTed to.
        :param datas: iterable of account-number strings.
        """
        # Requests are sent with verify=False, so silence the resulting
        # certificate warning instead of printing it once per request.
        urllib3.disable_warnings(InsecureRequestWarning)
        for kuandaihao in datas:
            if kuandaihao == '':
                # Blank input lines are not accounts; do not query them.
                continue
            payload = json.dumps({
                'queryParam': {'varMap': {'accountNum': kuandaihao}},
                'singleRow': 'true'
            })
            try:
                response = requests.post(
                    url=url,
                    headers=headers,
                    data=payload,
                    verify=False,
                    timeout=self.request_timeout,
                )
                result = json.loads(response.text)['data']
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                # One bad request must not kill the worker and silently drop
                # the remaining accounts of its group.
                print(exc)
                self.fail_output_list.append(kuandaihao)
                continue
            if result != []:
                self.output_list.append(result)
                print(result)
            else:
                self.fail_output_list.append(kuandaihao)

    def run(self):
        """Fetch all accounts on up to ``max_workers`` threads, then report.

        Emits ``sinOut`` with the fetched records and ``sinOut_lable`` with a
        summary (record count, elapsed seconds, failed accounts).

        :return: None
        """
        start_time = time.time()
        # Drop blank lines and surrounding whitespace left over from the
        # text box before distributing the work.
        accounts = [item.strip() for item in self.kuandaihao_list if item.strip()]
        if accounts:
            # Ceiling division keeps the number of groups <= max_workers.
            step = -(-len(accounts) // self.max_workers)
            data_2wei_list = [accounts[i:i + step] for i in range(0, len(accounts), step)]
        else:
            data_2wei_list = []

        threads = []
        for each_group in data_2wei_list:
            t = threading.Thread(target=self.spider, args=(self.headers, self.url, each_group))
            threads.append(t)
            t.start()
        # Block this QThread (not the GUI thread) until every worker is done.
        for t in threads:
            t.join()
        end_time = time.time()

        self.sinOut.emit(self.output_list)  # publish the fetched records
        tips = f'查询完成!成功查询{len(self.output_list)}条数据!使用{end_time - start_time}秒!失败账号为{self.fail_output_list}!'
        self.sinOut_lable.emit(tips)  # publish the summary for the bottom label


class Window(QMainWindow, Ui_MainWindow):
    """Main window: takes account numbers, runs the spider, shows the result."""

    def __init__(self):
        super().__init__()
        self.setupUi(self)  # build the widgets defined in the UI class
        self.connect_signals()  # wire up signals and slots

    def connect_signals(self):
        """Connect the two buttons to their handlers."""
        self.btn_spider.clicked.connect(self.btn_spider_clicked)  # start the query
        self.btn_get.clicked.connect(self.btn_openfile_clicked)  # open the Excel file

    def btn_spider_clicked(self):
        """Start the spider thread for the accounts typed into the text box.

        The button stays disabled until ``data_show`` receives the results.

        :return: None
        """
        self.btn_spider.setEnabled(False)
        get_key = self.textEdit.toPlainText()  # raw text entered by the user
        datas = get_key.split('\n')  # one account number per line
        # NOTE(review): this attribute shadows QObject.thread(); kept under
        # its original name so existing references keep working.
        self.thread = kuandaiSpiderThread(datas)
        self.thread.sinOut.connect(self.data_show)
        self.thread.sinOut_lable.connect(self.tips_show)
        self.thread.start()

    def tips_show(self, tips):
        """Show the summary text in the bottom label."""
        self.low_lable.setText(tips)

    def data_show(self, kuandai_details):
        """Save the fetched records to ``./data.xlsx`` and show them in the table.

        :param kuandai_details: list of records emitted by the spider thread.
        """
        self.btn_spider.setEnabled(True)
        data = pd.DataFrame(kuandai_details)
        try:
            data.to_excel(r'./data.xlsx')
        except Exception as e:
            # UI boundary: report any save failure instead of crashing.
            print(e)
            QMessageBox.warning(self, '警告', f'请关闭需要保存的excel文件:{e}', QMessageBox.Close, )

        row_count, column_count = data.shape
        self.model = QStandardItemModel(row_count, column_count)
        # Horizontal header labels; Qt expects strings, column labels may not be.
        self.model.setHorizontalHeaderLabels([str(label) for label in data.columns])
        for row in range(row_count):
            for column in range(column_count):
                # Text of the cell at this position.
                self.model.setItem(row, column, QStandardItem(str(data.iat[row, column])))
        # The table view is already part of the window; only its model changes.
        # No layout is installed here: a QMainWindow already owns a layout and
        # rejects setLayout().
        self.table_view.setModel(self.model)

    def btn_openfile_clicked(self):
        """Open ``data.xlsx`` with the application registered for it."""
        try:
            os.startfile(r'.\data.xlsx')
        except (OSError, AttributeError) as e:
            # OSError: the file is missing or cannot be opened.
            # AttributeError: os.startfile only exists on Windows.
            QMessageBox.warning(self, '警告', f'无法打开文件:{e}', QMessageBox.Close)


def main():
    """Create the application, show the main window and run the event loop."""
    app = QApplication(sys.argv)
    mywindow = Window()
    mywindow.show()
    sys.exit(app.exec_())


if __name__ == "__main__":
    main()