python3 多线程 采集 xpath
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Thread3 multi-threaded scraping test.

Worker threads pull agent-URL task dicts from a shared queue, fetch a
contact-info JSON endpoint plus the broker profile page, parse broker
details with XPath, and buffer documents for bulk insertion into MongoDB
by the main thread.
"""
import threading
import time
import queue
import json
import os

import requests
from lxml import html as lxml_html

import Mongo_utils
import mysqlUtils

etree = lxml_html.etree

# Set to 1 by the main thread to tell workers to stop.
exitFlag = 0

db = Mongo_utils.mongodb_15_27017task()
table = db["xx_anjuke_agent1"]
table_urls = db["xx_spider_urls1"]
list_pro = mysqlUtils.select_pro()
list_urls = table_urls.find().limit(2000)

insert_list = []  # documents buffered for bulk insert
del_list = []     # processed source urls (removal currently disabled below)
# Guards insert_list/del_list, which are shared between workers and main.
list_lock = threading.Lock()


class myThread(threading.Thread):
    """Worker thread that drains the shared work queue via spider()."""

    def __init__(self, threadId, name, q):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.name = name
        self.q = q

    def run(self):
        # BUG FIX: thread names come from range() (ints); str() avoids a
        # TypeError on string concatenation.
        print("开始线程" + str(self.name))
        spider(self.name, self.q)
        print("退出线程" + str(self.name))


def head():
    """Return default browser-like HTTP headers for the scrape requests."""
    return {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "Connection": "keep-alive",
        "Content-Type": "text/html; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }


def spider(name, q):
    """Worker loop: consume task dicts from *q* until exitFlag is set.

    Each task carries _id/city/zone/street/urls.  Two requests are issued:
    the contact JSON endpoint keyed by _id, then the broker profile page.
    A 302 response is treated as an anti-bot challenge and aborts the whole
    process (os._exit).  Parsed documents are appended to the shared
    insert_list under list_lock.
    """
    while not exitFlag:
        queueLock.acquire()
        if workQueue.empty():
            queueLock.release()
            continue
        i = q.get()
        queueLock.release()

        _id = i["_id"]
        city = i["city"]
        zone = i["zone"]
        street = i["street"]
        urls = i["urls"]
        headers = head()
        try:
            # NOTE(review): endpoint URL is redacted in the original source.
            url = "https://。。。。。。。。。。。" % _id
            response_contact = requests.session().get(
                url=url, allow_redirects=False, headers=headers, timeout=1)
            print(response_contact.status_code)
            if response_contact.status_code == 302:
                print("验证")
                print(url)
                os._exit(0)
            res = json.loads(response_contact.text)
            contact = res['data']
            response_dl = requests.session().get(
                url=urls, allow_redirects=False, headers=headers, timeout=1)
            if response_dl.status_code == 302:
                print("验证")
                print(urls)
                os._exit(0)
            if "获取成功" not in response_contact.text or "房屋编码" not in response_dl.text:
                # BUG FIX: the original printed "pass" then fell through and
                # parsed the page anyway; skip this task instead.
                print("pass")
                continue
            # Renamed locals: the original rebound both the imported `html`
            # module and the `name` parameter here.
            doc = etree.HTML(response_dl.content)
            broker_name = doc.xpath("//div[@class='brokercard-name']/text()")[0].strip()
            company = doc.xpath("//div[@class='broker-company']/p[1]/a/text()")[0]
            company_url = doc.xpath("//div[@class='broker-company']/p[1]/a/@href")[0]
            store = doc.xpath("//div[@class='broker-company']/p[2]/span/text()")[0]
            staffNo = "https://anjuxingye1.anjuke.com/gongsi-jjr-%s/" % _id
            # BUG FIX: removed the duplicated "store_url"/"staffNo" keys; the
            # values kept are the ones Python's dict literal kept (the last).
            mydict = {
                "_id": _id, "city": city, "zone": zone, "street": street,
                "name": broker_name, "company": company,
                "company_url": company_url, "store": store,
                "site": "anjuke", "store_url": "", "staffNo": staffNo,
                "tag": "8", "all_comm": "", "contact": contact,
            }
            with list_lock:  # BUG FIX: appends raced with the main thread's flush
                insert_list.append(mydict)
                # del_list.append(urls)
                print("size: %s" % len(insert_list))
        except Exception as e:
            # Best-effort scraping: log the failure instead of silently
            # swallowing it (original used a bare `except: pass`).
            print(e)
        print("%s processing %s" % (name, i))


threadList = range(0, 5)
queueLock = threading.Lock()
workQueue = queue.Queue(50000)
threads = []
threadID = 1

# Spawn the worker pool.
for tName in threadList:
    thread = myThread(threadID, tName, workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue while holding the lock so workers wait until it is primed.
queueLock.acquire()
for word in list_urls:
    workQueue.put(word)
queueLock.release()

# Wait for the queue to drain, periodically flushing the buffered documents.
while not workQueue.empty():
    time.sleep(0.1)  # BUG FIX: avoid a pure busy-spin burning a core
    if len(insert_list) > 10:
        with list_lock:
            batch = list(insert_list)
            insert_list.clear()
            del_list.clear()
        try:
            table.insert_many(batch, ordered=False)
            # table_urls.remove({"urls": {"$in": del_list}})
            print("插入1000")
        except Exception as e:
            print(e)

# Tell the workers to exit, then flush whatever is left in the buffer.
exitFlag = 1
try:
    if insert_list:  # BUG FIX: insert_many([]) raises; guard the final flush
        table.insert_many(insert_list, ordered=False)
        # table_urls.remove({"urls": {"$in": del_list}})
        print("插入1000")
except Exception as e:
    print(e)
insert_list.clear()
del_list.clear()

# Wait for all workers to finish.
for t in threads:
    t.join()
print("退出主线程")
转载注明出处
如果本文对你有帮助,请帮忙啦~~
打开支付宝首页搜“522901509”领红包,领到大红包的小伙伴赶紧使用哦!