import urllib.request

# Build a Request object for the Tieba thread page.
url = "https://tieba.baidu.com/p/6310762577"
request = urllib.request.Request(url)
# Open the URL inside a `with` block so the HTTP response is always closed,
# even if reading the body raises.
with urllib.request.urlopen(request) as response:
    # read() returns the raw response body as bytes.
    html = response.read()
# Decode the bytes as UTF-8 to get the page source as text.
html = html.decode("utf-8")
html

'\n<!DOCTYPE html><!--STATUS OK--><html><head><meta name="keywords" content="百度贴吧,美图骑单,车的"/><meta name="description" content="骑单车的人..骑单车的人" /><meta charset="UTF-8"><meta furl="tieba.baidu.com/f?kw=%E7%BE%8E%E5%9B%BE&ie=utf-8" fname="美图"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><meta name="baidu-site-verification" content="jpBCrwX689" /><link rel="search" type="application/opensearchdescription+xml" href="/tb/cms/content-search.xml" title="百度贴吧" /><title>骑单车的人_美图吧_百度贴吧</title><script type="text/javascript">void function(t,e,n,a,o,i,r){t.alogObjectName=o,t[o]=t[o]||function(){(t[o].q=t[o].q||[]).push(arguments)},t[o].l=t[o].l||+new Date,a="https:"===t.location.protocol?"https://fex.bdstatic.com"+a:"http://fex.bdstatic.com"+a;var c=!0;if(t.alogObjectConfig&&t.alogObjectConfig.sample){var s=Math.random();t.alogObjectConfig.rand=s,s>t.alogObjectConfig.sample&&(c=!1)}c&&(i=e.createElement(n),i.async=!0,i.src=a+"?v="+~(new Date/864e5)+~(new Date/864e5),r=e.getElementsByTagName(n)[0],r.parentNode.insertBefore(i,r))}(window,document,"script","/hunter/alog/alog.min.js","alog"),void function(){function t(){}window.PDC={mark:function(t,e){alog("speed.set",t,e||+new Date),alog.fire&&alog.fire("mark")},init:function(t){alog("speed.set","options",t)}

用正则表达式解析内容

# Import the regular-expression module.
import re

# Pattern for the image URLs in the page source: lazily capture everything
# between `src="` and `" size`, requiring the capture to end in a literal
# ".jpg". A raw string with an escaped dot is used so that "." cannot match
# an arbitrary character, and the name avoids shadowing the builtin `str`.
img_pattern = r'src="(.+?\.jpg)" size'
imger = re.compile(img_pattern)
# Collect every image URL found in the downloaded page.
imglist = imger.findall(html)
imglist

['https://imgsa.baidu.com/forum/w%3D580/sign=4b966626e4c4b7453494b71efffd1e78/aeea0b55b319ebc4f68696968d26cffc1f17161a.jpg',
 'https://imgsa.baidu.com/forum/w%3D580/sign=4172a3f9a8efce1bea2bc8c29f50f3e8/30ca91ef76c6a7ef1f864829f2faaf51f2de667d.jpg',
 'https://imgsa.baidu.com/forum/w%3D580/sign=f1a090968d26cffc692abfba89004a7d/3c1fb64543a982268919cd288582b9014890ebc5.jpg']

# Save every image URL in imglist to a local folder.
import  os
import time

# Destination folder for the downloads. It is created up front (no error if
# it already exists) so urlretrieve does not fail on a missing directory.
save_dir = "C:/Users/1/Desktop"
os.makedirs(save_dir, exist_ok=True)

# Files are numbered from 1 in the order the URLs were found.
for i, img in enumerate(imglist, start=1):
    time.sleep(1)  # wait one second before fetching the next image
    urllib.request.urlretrieve(img, "{}/{}.jpg".format(save_dir, i))
print("爬虫结束")

爬虫结束  只上传一张0.0

get方法抓取

import urllib.request

# Search-suggestion URL, found by switching the browser to iPhone 6 Plus
# device emulation on Taobao and looking at the JS requests in the F12 tools.
url="https://suggest.taobao.com/sug?q=python+%E7%88%AC%E8%99%AB&code=utf-8&area=c2c&nick=&sid=null&callback=jsonp157311230486278746"
# Open the URL inside a `with` block so the HTTP response is always closed,
# then decode the body bytes to text.
with urllib.request.urlopen(url) as response:
    html = response.read().decode("utf8")
html

'\r\njsonp157311230486278746({"result":[["python爬虫<b>书籍<\\/b>","690.2666666666667"],["python<b>3网络<\\/b>爬虫<b>开发实战<\\/b>","396.02"],["python<b>3网络<\\/b>爬虫<b>开发<\\/b>","429.95348837209303"],["python爬虫<b>实战<\\/b>","1046.909090909091"],["python爬虫<b>课程网课<\\/b>","107.2"],["python<b>3网络<\\/b>爬虫","473.5"],["python爬虫<b>源代码<\\/b>","110.9375"],["python爬虫<b>项目<\\/b>","1116.8333333333333"],["python<b>网络<\\/b>爬虫<b>实战<\\/b>","658.6"],["python爬虫<b>入门<\\/b>","1084.969696969697"]]})'

#把json的字符串格式转换成python的字典类型

{'result': [['python爬虫<b>书籍</b>', '690.2666666666667'],
  ['python<b>3网络</b>爬虫<b>开发实战</b>', '396.02'],
  ['python<b>3网络</b>爬虫<b>开发</b>', '429.95348837209303'],
  ['python爬虫<b>实战</b>', '1046.909090909091'],
  ['python爬虫<b>课程网课</b>', '107.2'],
  ['python<b>3网络</b>爬虫', '473.5'],
  ['python爬虫<b>源代码</b>', '110.9375'],
  ['python爬虫<b>项目</b>', '1116.8333333333333'],
  ['python<b>网络</b>爬虫<b>实战</b>', '658.6'],
  ['python爬虫<b>入门</b>', '1084.969696969697']]}

import json

# The response is JSONP, i.e. `callback({...})`. Take the text between the
# first "(" and the last ")" instead of fixed offsets, which stop working as
# soon as the callback name or the payload length changes.
a = html[html.index("(") + 1:html.rindex(")")]
# Convert the JSON string into a Python dict.
dic = json.loads(a)

for item in dic["result"]:
    # item[0] is the suggestion text; drop the <b>...</b> tags around it.
    print(item[0].replace("<b>", "").replace("</b>", ""))

python爬虫书籍
python3网络爬虫开发实战
python3网络爬虫开发
python爬虫实战
python爬虫课程网课
python3网络爬虫
python爬虫源代码
python爬虫项目
python网络爬虫实战
python爬虫入门

# The standard library can percent-encode text for use in a URL.
import urllib.parse

# Named `quoted` rather than `str` so the builtin `str` is not shadowed.
quoted = urllib.parse.quote("牛仔衣")
quoted

'%E7%89%9B%E4%BB%94%E8%A1%A3'

# Send a User-Agent header so the request looks like it comes from a browser.
# The value can be copied from the browser's F12 tools and must be passed as
# a dict.
import json
import urllib.parse

header={"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
keyword = '羽绒服'
# Percent-encode the keyword so the URL contains only ASCII characters.
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(urllib.parse.quote(keyword))
# Attach the headers through a Request object and close the response when done.
request = urllib.request.Request(url, headers=header)
with urllib.request.urlopen(request) as response:
    # No callback parameter is sent, so the body is parsed directly as JSON.
    dic = json.loads(response.read().decode("utf-8"))
for item in dic['result']:
    print(item[0])

​

11-21

python 爬虫实例

用正则表达式解析内容

get方法抓取

加入user-agent，冒充浏览器 user-agent可在浏览器中F12 找到，且必须以字典格式传入

设置代理IP

导航

公告