第一次调试
from threading import Thread
from fake_useragent import UserAgent
import requests
from time import sleep
for i in range(1,11):
url = f"https://www.hupu.com/home/v1/news?pageNo={i}&pageSize=50"
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}
resp = requests.get(url, headers=headers)
# print(resp.json())
for d in resp.json().get("data"):
print(f'tid:{d.get("tid")}topic:{d.get("topicName")}content:{d.get("content")}')
sleep(3)
if resp.status_code == 200:
print(f'成功获取第{i}页数据')
安雨:
老师,print(f'成功获取第{i}页数据')
安雨:
老师,这行代码怎么老是打印"第一页"?
安雨:
怎么不起作用啊?
虚竹:
嵌套循环
虚竹:
外层一次,内层执行完毕,外层才会进入第二次
注意if 的位置
```python
from threading import Thread
from fake_useragent import UserAgent
import requests
from time import sleep
def spider():
    """Fetch pages 1-10 of the Hupu news feed and print each item.

    Network-bound worker intended as a `Thread` target. Sleeps 3s per
    page to throttle requests. Prints a success line per page fetched.
    NOTE(review): indentation reconstructed from a flattened paste; the
    status check is placed at page level per the lesson above.
    """
    for i in range(1, 11):
        url = f"https://www.hupu.com/home/v1/news?pageNo={i}&pageSize=50"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        }
        resp = requests.get(url, headers=headers)
        # print(resp.json())
        for d in resp.json().get("data"):
            print(f'tid:{d.get("tid")}topic:{d.get("topicName")}content:{d.get("content")}')
        sleep(3)
        if resp.status_code == 200:
            print(f'成功获取第{i}页数据')
if __name__ == "__main__":
    # Both threads run the SAME spider(): each independently fetches all
    # 10 pages, so the work is duplicated rather than split — this is the
    # motivating flaw for the queue-based task pool that follows.
    t1 = Thread(target=spider)
    t1.start()
    t2 = Thread(target=spider)
    t2.start()
两个线程执行同一个函数,会重复抓取同样的地址——任务没有拆分,需要用队列把 URL 分发给各线程
```python
from threading import Thread
from fake_useragent import UserAgent
import requests
from time import sleep
from queue import Queue
def spider():
    """Take one URL from the shared queue and print it (demo step).

    Relies on a module-level ``url_queue`` created under ``__main__``.
    NOTE(review): each call consumes only a single URL — the fetching and
    parsing code is deliberately left commented out at this stage of the
    lecture; the full loop appears in the class-based version below.
    """
    # print the address pulled from the queue
    print(url_queue.get())
    # headers = {
    #     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    # }
    # resp = requests.get(url, headers=headers)
    # # print(resp.json())
    #
    # for d in resp.json().get("data"):
    #     print(f'tid:{d.get("tid")}topic:{d.get("topicName")}content:{d.get("content")}')
    # sleep(3)
    # if resp.status_code == 200:
    #     print(f'成功获取第{i}页数据')
if __name__ == "__main__":
    # Build the shared task pool: one URL per page.
    url_queue = Queue()
    for i in range(1, 11):
        url = f"https://www.hupu.com/home/v1/news?pageNo={i}&pageSize=50"
        url_queue.put(url)
    # Start 3 workers; each spider() call consumes exactly one URL,
    # so only 3 of the 10 queued URLs are printed in this demo.
    for i in range(3):
        t1 = Thread(target=spider)
        t1.start()
from threading import Thread
from fake_useragent import UserAgent
import requests
from time import sleep
from queue import Queue
class MyThread(Thread):
    """Worker thread that drains the shared ``url_queue``.

    Relies on a module-level ``url_queue`` (a ``queue.Queue`` created
    under ``__main__``). Each worker keeps pulling URLs and fetching them
    until the queue is empty.
    """

    def __init__(self, i):
        # i is this worker's index — used only in the log message below.
        Thread.__init__(self)
        self.i = i

    def run(self):
        # Keep consuming until the shared queue is drained.
        # NOTE(review): empty()/get() is not atomic across threads; with
        # few workers on a pre-filled queue this is benign — confirm if
        # the worker count grows.
        while not url_queue.empty():
            url = url_queue.get()
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            }
            print(url)
            resp = requests.get(url, headers=headers)
            # # print(resp.json())
            #
            # for d in resp.json().get("data"):
            #     print(f'tid:{d.get("tid")}topic:{d.get("topicName")}content:{d.get("content")}')
            sleep(3)
            if resp.status_code == 200:
                # NOTE(review): self.i is the thread index, not the page
                # number — the message is misleading once a worker fetches
                # multiple pages; consider deriving pageNo from the URL.
                print(f'成功获取第{self.i}页数据')
if __name__ == "__main__":
    # Build the shared task pool: one URL per page.
    url_queue = Queue()
    for i in range(1, 11):
        url = f"https://www.hupu.com/home/v1/news?pageNo={i}&pageSize=50"
        url_queue.put(url)
    # Start 2 workers that drain the queue cooperatively.
    for i in range(2):
        # BUG fix: MyThread.__init__ requires the index argument;
        # the original `MyThread()` raised TypeError at startup.
        t1 = MyThread(i)
        t1.start()