【Python3】【Crawler】bilibili photography section
0x00 Preparation
- The girls on bilibili are crazy pretty (runs away). No really, this is a technical post.
- My first time scraping a dynamic site: the trick is to simulate browser access directly (a minimal sketch follows below).
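A minimal sketch of that idea: Selenium drives a real browser, so the page's JavaScript has already run by the time you read page_source, which a plain HTTP GET would never see:
from selenium import webdriver

# Drive a real browser so the page's JavaScript executes;
# page_source then holds the rendered DOM, not the empty HTML shell.
browser = webdriver.Chrome()
browser.get('https://h.bilibili.com/eden/picture_area#/cos/hot')
rendered = browser.page_source  # includes the JS-rendered content
browser.quit()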
0x01 Environment
1. Python 3, nothing to explain there.
2. Fixes for Selenium installation errors:
Install:
pip install selenium
Error 1:
During handling of the above exception, another exception occurred:
Fixed by installing chromedriver.
Error 2:
FileNotFoundError: [WinError 2] The system cannot find the file specified.
from selenium import webdriver
#driver = webdriver.Chrome("C:\Development\&GwjEnvironment\chromedriver.exe")  # wrong path: backslashes in a plain string
driver = webdriver.Chrome("C:/Development/&GwjEnvironment/chromedriver.exe")  # correct path: forward slashes
driver.get("http://www.baidu.com")
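A raw string also works and keeps the backslashes exactly as typed:
driver = webdriver.Chrome(r"C:\Development\&GwjEnvironment\chromedriver.exe")  # r'' prefix: backslashes stay literal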
Alternatively:
Open the file below and change every occurrence of the shell flag to shell=True:
C:\Development\Python36\Lib\subprocess.py
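Editing subprocess.py patches the standard library itself; a gentler option (assuming chromedriver.exe still lives in the folder used above) is to put that folder on PATH before creating the driver:
import os
from selenium import webdriver

# With the chromedriver folder on PATH, no explicit path is needed.
os.environ['PATH'] += os.pathsep + 'C:/Development/&GwjEnvironment'
driver = webdriver.Chrome()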
3. Install the driver matching your browser.
For my Chrome, that's chromedriver_win32.
4. For the libraries you need, check the imports in the code; the install line below covers them.
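Judging from the imports, that boils down to:
pip install selenium beautifulsoup4 requests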
0x02 Dump the pics and run
Okay, this really isn't a technical post.
Code 2 (via the JSON API):
# Scrapes bilibili's hot cosplay photography
import os
from selenium import webdriver
from bs4 import BeautifulSoup
import ssl
from time import sleep
import requests
import random
import re
import json
UserAgent_List = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {'User-Agent': random.choice(UserAgent_List),
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip',
           }
base_url = 'https://h.bilibili.com/eden/picture_area#/cos/hot'
pic_save_path = "output/"
# Open a browser and simulate the requests
def browser_get(pageNum):
    browser = webdriver.Chrome()
    browser.get(base_url)
    h = int(int(pageNum) / 20)  # one scroll per batch (the page seems to load ~20 entries at a time)
    for i in range(h):
        browser.execute_script("window.scrollBy(0,3000)")  # scroll down to trigger the lazy loading
        sleep(2)
    html_text = browser.page_source
    soup = BeautifulSoup(html_text, 'html.parser')
    urls = soup.find('div', {'class': 'area-wrapper'}).findAll('h3', {'class': 'article-title'})
    # print(len(urls))
    count = 1
    for url in urls:
        # first taste of regex: strip everything but the digits to get the doc_id
        test = 'https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=' + re.sub(r'\D', "", url.a['href'])
        browser.get(test)
        sleep(2)  # async loading: wait before reading the page source
        js = browser.page_source  # how to get the JSON directly??? (see the sketch after this listing)
        # print(js)
        get_meizi_url(js)
        count += 1
        if count > int(pageNum):
            break
    browser.quit()
# Grab the girls from each page
def get_meizi_url(js):
    # print(js)
    soup = BeautifulSoup(js, 'html.parser')
    text = soup.find('pre').string  # the browser wraps the raw JSON response in a <pre> tag
    hhh = json.loads(text)
    title = ""
    for i in hhh['data']['item']['title']:
        title = title + i
    if not os.path.exists(title):
        os.makedirs(title)
    else:
        return  # this set was already downloaded, skip it
    print(title)
    count = 1
    for i in hhh['data']['item']['pictures']:
        print(i['img_src'])
        # match the real extension instead of assuming .jpg
        qaq = re.search(r'(jpg)|(webp)|(png)|(jpeg)', i['img_src'])
        filename = '%s/%s/%s.%s' % (os.path.abspath('.'), title, count, qaq.group())
        with open(filename, 'wb+') as qwq:
            qwq.write(requests.get(i['img_src'], headers=headers).content)
        count += 1
    return
if __name__ == '__main__':
    ssl._create_default_https_context = ssl._create_unverified_context  # work around HTTPS certificate errors
    pageNum = input(u'How many sets of pictures do you want: ')
    # if not os.path.exists(pic_save_path):
    #     os.makedirs(pic_save_path)
    browser_get(pageNum)
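About the question left in the code (how to get the JSON directly): the detail endpoint serves plain JSON, so requests can most likely fetch and parse it without a browser at all. A minimal sketch under that assumption, reusing the headers defined above:
import requests

def fetch_detail(doc_id):
    # Same endpoint browser_get() visits, but parsed as JSON in one step:
    # no page_source, no <pre> scraping, no sleep().
    url = 'https://api.vc.bilibili.com/link_draw/v1/doc/detail'
    resp = requests.get(url, params={'doc_id': doc_id}, headers=headers)
    resp.raise_for_status()
    return resp.json()

# Usage (this doc_id is made up):
# data = fetch_detail('285898')
# for pic in data['data']['item']['pictures']:
#     print(pic['img_src'])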
Code 1 (pure HTML):
# Scrapes bilibili's hot cosplay photography
# BUG in the save loop below: bilibili serves images in mixed formats, so the
# extension needs a regex match instead of a hardcoded .jpg; left as a TODO here (Code 2 above fills it in).
import os
from selenium import webdriver
from bs4 import BeautifulSoup
import ssl
from time import sleep
import requests
import random
UserAgent_List = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {'User-Agent': random.choice(UserAgent_List),
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip',
           }
base_url = 'https://h.bilibili.com/eden/picture_area#/cos/hot'
pic_save_path = "output/"
# Open a browser and simulate the requests
def browser_get(pageNum):
    browser = webdriver.Chrome()
    browser.get(base_url)
    h = int(int(pageNum) / 20)  # one scroll per batch (the page seems to load ~20 entries at a time)
    for i in range(h):
        browser.execute_script("window.scrollBy(0,3000)")  # scroll down to trigger the lazy loading
        sleep(2)
    html_text = browser.page_source
    soup = BeautifulSoup(html_text, 'html.parser')
    urls = soup.find('div', {'class': 'area-wrapper'}).findAll('h3', {'class': 'article-title'})
    # print(len(urls))
    count = 1
    for url in urls:
        browser.get('https://' + url.a['href'])
        sleep(2)  # async loading: wait before reading the page source
        html = browser.page_source
        get_meizi_url(html)
        count += 1
        if count > int(pageNum):
            break
    browser.quit()
# Grab the girls from each page
def get_meizi_url(html):
    # print(html)
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('h1', attrs={'class': 'article-title dp-i-block v-middle'}).string
    if not os.path.exists(title):
        os.makedirs(title)
    print(title)
    href = soup.find('div', attrs={'class': 'images'}).findAll('img')
    count = 1
    for a in href:
        print(a['src'])
        filename = '%s/%s/%s.jpg' % (os.path.abspath('.'), title, count)  # BUG: extension hardcoded as .jpg
        with open(filename, 'wb+') as qwq:
            qwq.write(requests.get(a['src'], headers=headers).content)
        count += 1
if __name__ == '__main__':
    ssl._create_default_https_context = ssl._create_unverified_context  # work around HTTPS certificate errors
    pageNum = input(u'How many sets of pictures do you want: ')
    # if not os.path.exists(pic_save_path):
    #     os.makedirs(pic_save_path)
    browser_get(pageNum)
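The BUG flagged at the top of Code 1 (the hardcoded .jpg) is exactly what the re.search in Code 2 fixes. Pulled out on its own, the fix looks like this; the fallback to jpg when nothing matches is my own addition:
import re

def pick_extension(img_src):
    # Match the real extension instead of hardcoding .jpg
    # (same pattern Code 2 uses).
    m = re.search(r'(jpg)|(webp)|(png)|(jpeg)', img_src)
    return m.group() if m else 'jpg'  # assumed fallback when nothing matches

The filename line then becomes '%s/%s/%s.%s' % (os.path.abspath('.'), title, count, pick_extension(a['src'])).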
Anyway, stealing the pics and running worked out for me. If the code breaks, contact the blogger; any expert willing to help dig through the JS is welcome.