The web crawling process

# 1. Write the fetched page straight to a file
import urllib.request

urllib.request.urlretrieve("http://www.baidu.com",
                           filename=r'D:\pyhon_jichu\test_file\file1.html')

# urlretrieve leaves temporary cache files behind while it runs; clean them up afterwards
urllib.request.urlcleanup()
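
urlretrieve can also report progress while it downloads: it accepts a reporthook callback that is invoked after each block is copied. A minimal sketch, assuming an illustrative output file name progress_download.html (not from the original post):

import urllib.request

# reporthook is called as (blocks copied so far, block size in bytes, total size in bytes)
def show_progress(block_num, block_size, total_size):
    if total_size > 0:
        percent = min(block_num * block_size * 100 / total_size, 100)
        print("downloaded %.1f%%" % percent)

urllib.request.urlretrieve("http://www.baidu.com",
                           filename=r'progress_download.html',
                           reporthook=show_progress)
urllib.request.urlcleanup()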


============================================================================================

# 2. Simulate a browser when crawling
import urllib.request

url = "http://www.baidu.com"

# If a site has anti-crawling checks, a plain request gets nothing back,
# so we disguise the request as coming from a browser

# 1. Build the request headers
# User-Agent: identifies the browser engine, version, and so on
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Mobile Safari/537.36"
}

# 2. Build a Request object (it wraps the URL together with the extra headers)
req = urllib.request.Request(url, headers=headers)

# 3. Send the request
response = urllib.request.urlopen(req)
data = response.read().decode("utf-8")
print(data)
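
After urlopen returns, the response object can also confirm that the request went through and show what the server sent back. A minimal sketch that continues from the snippet above (the printed fields are just examples):

# Status code, final URL after redirects, and one response header
print(response.getcode())                     # e.g. 200
print(response.geturl())                      # the URL that was actually fetched
print(response.info().get("Content-Type"))    # e.g. text/html; charset=utf-8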
"""
import urllib.request

import random
url = "http://www.baidu.com"


agnetList = [
"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Mobile Safari/537.36"
]

# 随机拿一个
agentStr = random.choice(agnetList)

req = urllib.request.Request(url)

# 向请求体里添加了User-Agent
req.add_header("User-Agent",agentStr)

# 发起请求
response = urllib.request.urlopen(req)

print(response.read().decode("utf-8"))
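
To apply the same disguised User-Agent to every later urlopen call without building a Request each time, urllib can install a global opener. A minimal sketch reusing the User-Agent string from above:

import urllib.request

agentStr = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Mobile Safari/537.36"

# build_opener returns an OpenerDirector; its addheaders list is sent with every request it makes
opener = urllib.request.build_opener()
opener.addheaders = [("User-Agent", agentStr)]
urllib.request.install_opener(opener)

# From here on, plain urlopen calls carry the User-Agent set above
data = urllib.request.urlopen("http://www.baidu.com").read().decode("utf-8")
print(len(data))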

===================================================================================================

# 3. Set a timeout
import socket
import urllib.error
import urllib.request

# If a page takes too long to respond, urlopen raises a timeout error and that attempt is skipped
for i in range(1, 100):
    try:
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode("utf-8")))
    except (urllib.error.URLError, socket.timeout):
        print("Request timed out, moving on to the next fetch")
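
If one timeout should not abandon a page completely, a small retry wrapper is a natural follow-up. A minimal sketch, assuming an illustrative helper name fetch_with_retry and three attempts per URL (neither is from the original post):

import socket
import urllib.error
import urllib.request

def fetch_with_retry(url, attempts=3, timeout=0.5):
    # Try the same URL a few times before giving up
    for attempt in range(1, attempts + 1):
        try:
            response = urllib.request.urlopen(url, timeout=timeout)
            return response.read().decode("utf-8")
        except (urllib.error.URLError, socket.timeout):
            print("attempt %d timed out" % attempt)
    return None

html = fetch_with_retry("http://www.baidu.com")
print(len(html) if html else "all attempts failed")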


