Python爬虫学习——urllib库

#获取一个get请求
#import urllib.request
# response = urllib.request.urlopen("http://www.baidu.com")
# print(response.read().decode('utf-8'))   #对获取到的网页源码进行utf-8解码

# Disabled example: the snippet below is wrapped in a bare string literal, so
# none of it executes. It url-encodes the form field hello=world, POSTs it to
# http://httpbin.org/post, and prints the response body decoded as UTF-8.
'''
#获取一个post请求
import urllib.request
import urllib.parse
data = bytes(urllib.parse.urlencode({"hello":"world"}),encoding = "utf-8")
response = urllib.request.urlopen("http://httpbin.org/post",data = data)
print(response.read().decode("utf-8"))
'''
# Disabled example: the snippet below is wrapped in a bare string literal, so
# none of it executes. It requests http://httpbin.org/get with a 0.01 s
# timeout and prints "time out!" when the request fails.
# The snippet imports urllib.error explicitly because it names
# urllib.error.URLError, closes the response with a context manager, and also
# catches socket.timeout, which a timeout during read() raises without being
# wrapped in URLError.
'''
#超时处理
import socket
import urllib.error
import urllib.request
try:
    with urllib.request.urlopen("http://httpbin.org/get",timeout=0.01) as response:
        print(response.read().decode("utf-8"))
except (urllib.error.URLError, socket.timeout):
    print("time out!")
'''
# Disabled example: the snippet below is wrapped in a bare string literal, so
# none of it executes. It opens http://www.baidu.com and prints the value of
# the response's "Server" header; the status-code print is commented out.
'''
#可以获取的数据
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
# print(response.status)
print(response.getheader("Server"))
'''
# Disabled example: the snippet below is wrapped in a bare string literal, so
# none of it executes. It builds a POST Request for http://httpbin.org/post
# carrying the url-encoded field name=eric and custom headers (including a
# browser-style User-Agent), sends it, and prints the UTF-8-decoded body.
'''
#伪装成浏览器
import urllib.request
import urllib.parse
url = "http://httpbin.org/post"
data = bytes(urllib.parse.urlencode({"name":"eric"}),encoding='utf-8')
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Amzn-Trace-Id": "Root=1-63f48078-2f75544f15e5c54a7b905e25"
}
req = urllib.request.Request(url=url,data=data,headers=headers,method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
'''
# Fetch the Douban home page while presenting a browser User-Agent, then
# print the page source decoded as UTF-8.
import urllib.request

url = "https://www.douban.com"
headers = {
    # Browser-style User-Agent sent in place of urllib's default one.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    # NOTE(review): presumably copied from an earlier captured request and
    # not required by the target site - confirm before removing.
    "X-Amzn-Trace-Id": "Root=1-63f48078-2f75544f15e5c54a7b905e25",
}
req = urllib.request.Request(url=url, headers=headers)
# The context manager closes the HTTP response even if reading or decoding
# raises, so the connection is not leaked.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
posted @ 2023-02-21 17:22  鹤城  阅读(27)  评论(0)  编辑  收藏  举报