爬虫基础知识
爬虫基础知识
请求方式
# Send a POST request.
import urllib.parse
import urllib.request

# urlopen() requires the request body to be bytes, so the form fields are
# URL-encoded first and then encoded as UTF-8.
data = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")
# Supplying ``data`` makes urlopen() send a POST instead of a GET. This can be
# used to simulate submitting an account/password login form; a real login
# flow would additionally need cookies.
# The ``with`` block closes the response (and its connection) when finished.
with urllib.request.urlopen("https://httpbin.org/post", data=data) as response:
    print(response.read().decode("utf-8"))
# Send a GET request.
import urllib.request

# With no ``data`` argument urlopen() issues a plain GET request.
# NOTE(review): the original note here about simulating an account/password
# login (with cookies needed later) appears to be copy-pasted from the POST
# example; no form data is sent by this request.
# The ``with`` block closes the response (and its connection) when finished.
with urllib.request.urlopen("https://httpbin.org/get") as response:
    print(response.read().decode("utf-8"))
超时处理
# Timeout handling.
import socket
import urllib.error
import urllib.request

try:
    # Give up if no result has arrived after 1 second.
    with urllib.request.urlopen("https://httpbin.org/get", timeout=1) as response:
        print(response.read().decode("utf-8"))
except (urllib.error.URLError, socket.timeout):
    # urlopen() reports a connection-phase timeout as URLError, while a
    # timeout that happens later, during read(), surfaces as socket.timeout
    # directly - both must be caught to cover the whole request.
    # NOTE: any other URLError (e.g. a DNS failure) also lands here and is
    # reported with the same message, as in the original snippet.
    print("time out!")
爬取豆瓣电影信息
import urllib.request

url = "http://www.douban.com"
# Request header that makes the request look like it comes from a browser
# (a desktop Chrome User-Agent string).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.121 Safari/537.36"
    )
}
# Bundle the URL and the custom headers into a single Request object.
req = urllib.request.Request(url=url, headers=headers)
# The ``with`` block closes the response (and its connection) when finished.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
记录学习的点点滴滴