Python 爬虫实例
In [5]:
import urllib.request
In [6]:
# Download the Tieba thread page and keep its body as text in `html`.
url = "https://tieba.baidu.com/p/6310762577"
# Build a Request object for the URL.
request = urllib.request.Request(url)
# Open the URL inside a context manager so the HTTP response is always
# closed, even if reading or decoding fails.
with urllib.request.urlopen(request) as response:
    # read() returns bytes; decode them as UTF-8 text.
    html = response.read().decode("utf-8")
html
Out[6]:
用正则表达式解析内容
In [7]:
#导入正则表达式包 import re
In [8]:
# Collect the URL of every .jpg image referenced in the downloaded page.
# The pattern is a raw string with an escaped dot, so only a literal ".jpg"
# suffix matches. It is compiled directly instead of first being stored in a
# variable named `str`, which shadowed the builtin.
imger = re.compile(r'src="(.+?\.jpg)" size')
imglist = imger.findall(html)  # all image URLs found in the page
imglist
Out[8]:
In [9]:
# Save every image in imglist to the desktop as 1.jpg, 2.jpg, ...
import os
import time

# enumerate(..., 1) numbers the files from 1 without a hand-maintained counter.
for i, img in enumerate(imglist, 1):
    time.sleep(1)  # wait one second before fetching the next image
    urllib.request.urlretrieve(img, "C:/Users/1/Desktop/{}.jpg".format(i))
print("爬虫结束")
get方法抓取
In [10]:
import urllib.request
In [25]:
# Query Taobao's search-suggestion endpoint. The URL was found under JS in the
# browser's F12 tools with the emulated device switched to "6 Plus".
url = "https://suggest.taobao.com/sug?q=python+%E7%88%AC%E8%99%AB&code=utf-8&area=c2c&nick=&sid=null&callback=jsonp157311230486278746"
# Open the URL inside a context manager so the HTTP response is always closed.
with urllib.request.urlopen(url) as response:
    html = response.read().decode("utf8")
html
Out[25]:
In [42]:
# Convert the JSON-formatted string in `a` into a Python dictionary.
# Note: JSON does not support single quotes, so a string that contains
# single-quoted values fails to parse.
import json

dic = json.loads(a)
dic
Out[42]:
In [41]:
# Strip the JSONP wrapper "callback( ... )" so that only the JSON payload is
# left. The payload is located by the first "(" and the last ")" instead of
# the hard-coded slice [26:452], which only fitted one response length.
a = html[html.index("(") + 1:html.rindex(")")]
In [54]:
# Print the first field of every result with the <b>...</b> tags removed.
for item in dic["result"]:
    suggestion = item[0]
    for tag in ("<b>", "</b>"):
        # str.replace substitutes every occurrence of the tag
        suggestion = suggestion.replace(tag, "")
    print(suggestion)
In [45]:
# Percent-encode a keyword using Python's standard library.
import urllib.parse

# quote() is documented in urllib.parse. The result is no longer bound to the
# name `str`, which replaced the builtin type for every later cell.
encoded_keyword = urllib.parse.quote("牛仔衣")
encoded_keyword
Out[45]:
In [56]:
import requests
import json

# Fetch Taobao's search suggestions for one keyword and print each of them.
keyword = '连衣裙'  # no manual URL encoding is needed
url = "https://suggest.taobao.com/sug?q=" + keyword + "&code=utf-8&area=c2c&nick=&sid=null"
response = requests.get(url)
# Body of the response as text.
html = response.text
dic = json.loads(html)
for item in dic['result']:
    print(item[0])
连衣裙女秋冬 连衣裙2019新款秋 连衣裙女 连衣裙2019秋款新 连衣裙夏 连衣裙女春秋 连衣裙长款秋冬 连衣裙收腰显瘦 气质 连衣裙两件套秋冬 连衣裙长裙女秋冬
# Look up the second-level suggestion menu for "连衣裙".
import requests
import json


def replace_str(html):
    """Return *html* with the bold-tag markers removed.

    The closing tag is matched with a backslash before its slash. This
    function is applied to the raw response text before ``json.loads``, so
    presumably that is the JSON-escaped form the server sends.
    """
    html = html.replace('<b>', '')
    # Raw string: '\/' in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The value is unchanged.
    html = html.replace(r'<\/b>', '')
    return html


keyword = '连衣裙'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url)
dic = json.loads(response.text)
for item in dic['result']:
    print(item[0])
    # Query again with each first-level suggestion to get its own suggestions.
    url2 = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(item[0])
    response2 = requests.get(url2)
    content = replace_str(response2.text)
    dic2 = json.loads(content)
    for item2 in dic2['result']:
        print('\t' + item2[0])
连衣裙2018款新款 连衣裙2018款新款女 连衣裙2018款新款女 雪纺 连衣裙2018夏新款女 中长款 修身 裙子女夏2018新款 中长款连衣裙 裙子夏女2018新款 款连衣裙 气质 连衣裙夏女2018新款 中长款 气质 女裙子2018新款 中长款连衣裙 女夏2018新款连衣裙长 款35岁 女装2018新款中长连衣裙 中长款 连衣裙女夏2018新款拉链款 连衣裙夏季新款 连衣裙夏季新款女 2018 气质 a字 连衣裙夏季新款 雪纺 2018 连衣裙夏季新款 宽松 韩版 2018 连衣裙夏季新款女 名媛 连衣裙夏季新款 2018韩版孕妇装 连衣裙夏季新款2018显瘦超仙 连衣裙夏季新款女装欧美中裙 连衣裙夏季新款女欧洲站2018 连衣裙夏季新款女韩版 棉麻 连衣裙夏季新款 时尚气质
反爬虫机制
加入 User-Agent 请求头,冒充浏览器。User-Agent 可在浏览器中按 F12 找到,且必须以字典格式传入。
# Add a User-Agent header to pose as a browser. The value can be found in the
# browser's F12 tools and must be passed in as a dictionary.
header = {
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
}
keyword = '羽绒服'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url, headers=header)
body_text = response.text
dic = json.loads(body_text)
for item in dic['result']:
    print(item[0])
羽绒服女中长款 修身显瘦 羽绒服女轻薄款 羽绒服中长款女 羽绒服女长款2017新款 韩版 潮 羽绒服 枣红色 羽绒服女收腰 羽绒服男轻薄 羽绒服女长款 冬季 中长款 羽绒服套装 羽绒服女长款收腰过膝
设置代理IP
# Send the request through a proxy (proxy IPs can be found by searching Baidu).
# requests looks a proxy up by the lowercase URL scheme, so the former
# {"HTTP": ...} key never matched and the proxy was silently bypassed. The
# target URL is https, so an "https" entry is required as well.
# NOTE(review): whether this proxy address is still reachable is not known.
proxy_address = "http://122.114.31.177:808"
proxies = {"http": proxy_address, "https": proxy_address}
header = {"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
keyword = '西装'
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url, headers=header, proxies=proxies)
dic = json.loads(response.text)
for item in dic['result']:
    print(item[0])
西装男套装 青少年 西装短裤套装女 西装 修身连衣裙 西装领长袖女 西装 短袖 西装v领 连衣裙 收腰 西装喇叭裤高腰 西装裤 夏 女 西装春秋女 西装热裤女
# Write the data to a file: collect the first- and second-level suggestions
# for "连衣裙" and save them as a CSV file.
import requests
import json
import pandas as pd


def replace_str(html):
    """Return *html* with the bold-tag markers removed.

    The closing tag is matched with a backslash before its slash. This
    function is applied to the raw response text before ``json.loads``, so
    presumably that is the JSON-escaped form the server sends.
    """
    html = html.replace('<b>', '')
    # Raw string: '\/' in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The value is unchanged.
    html = html.replace(r'<\/b>', '')
    return html


keyword = '连衣裙'
# requests looks a proxy up by the lowercase URL scheme, so the former
# {"HTTP": ...} key never matched and the proxy was silently bypassed. The
# target URLs are https, so an "https" entry is required as well.
# NOTE(review): whether this proxy address is still reachable is not known.
proxy_address = "http://122.114.31.177:808"
proxies = {"http": proxy_address, "https": proxy_address}
header = {"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
url = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(keyword)
response = requests.get(url, headers=header, proxies=proxies)
dic = json.loads(response.text)
lst = []
for item in dic['result']:
    # Level 1: a suggestion returned for the keyword itself.
    lst.append([item[0], 1])
    url2 = "https://suggest.taobao.com/sug?q={}&code=utf-8&area=c2c&nick=&sid=null".format(item[0])
    response2 = requests.get(url2, headers=header, proxies=proxies)
    content = replace_str(response2.text)
    dic2 = json.loads(content)
    for item2 in dic2['result']:
        # Level 2: a suggestion returned for a level-1 suggestion.
        lst.append([item2[0], 2])
# Write the collected rows to a CSV file with a header row and no index column.
data = pd.DataFrame(lst, columns=['title', 'level'])
data.to_csv('./lyq.csv', index=False, header=True)
print('运行结束')
# A crawler that fetches Youdao translation data with a POST request.
import requests
import json


def translate(word):
    """Translate *word* through Youdao's web endpoint, print and return it.

    Args:
        word: Text to translate; sent as the ``i`` form field.

    Returns:
        The ``tgt`` field of the first entry in ``translateResult``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        KeyError: If the response JSON has no ``translateResult`` entry.
    """
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
    # Headers copied from a browser session. The captured "Content-Length: 254"
    # entry is omitted on purpose: it described one specific request body, and
    # requests sets the header itself from the encoded form data.
    header = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded;",
        "Cookie": "OUTFOX_SEARCH_USER_ID=-2118096325@10.168.8.63; OUTFOX_SEARCH_USER_ID_NCOO=529585232.72911924; fanyi-ad-id=44547; fanyi-ad-closed=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abcm1o-ND2W6Y9HCq9vqw; _ntes_nnid=a7d939a02cd5c3865942d2a2051b410e,1529376772962; ___rl__test__cookies=1529398225385",
        "Host": "fanyi.youdao.com",
        "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # Form fields copied from the same browser session; only "i" varies.
    # NOTE(review): "salt" and "sign" are fixed captured values - confirm the
    # endpoint accepts them for arbitrary words.
    payload = {
        "i": word,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "1529398225392",
        "sign": "bf09bc9795dfc7863516162c961fd97e",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_CLICKBUTTION",
        "typoResult": "false",
    }
    # A timeout keeps the call from blocking forever if the server stalls.
    response = requests.post(url, data=payload, headers=header, timeout=10)
    dic = json.loads(response.text)
    result = dic['translateResult'][0][0]['tgt']
    print(result)
    # Returning the text lets callers use it; earlier callers ignored None.
    return result


if __name__ == '__main__':
    translate('中国')
China