爬虫动态网页实训

节点互动案例代码和运行结果截图

from selenium import webdriver
import time,random

# 设置浏览器隐藏
# option = webdriver.ChromeOptions()
# option.add_argument("--headless")

# Open the local demo page and click randomly chosen buttons ten times.
browser = webdriver.Chrome()
try:
    browser.get('http://localhost/demo1.html')
    buttons = browser.find_elements_by_class_name('mybutton')
    for _ in range(10):
        # Indexes 0..5 assume the page exposes at least six `.mybutton`
        # elements; with fewer, the IndexError is reported by the handler below.
        index = random.randint(0, 5)
        buttons[index].click()
        print('当前正在点击按钮', index + 1)
        time.sleep(1)
except Exception as e:
    # Best-effort demo: report the failure instead of crashing.
    print(e)
finally:
    # Always release the browser and its driver process, not only on error.
    browser.quit()

image

爬取网页Java图书信息

import requests
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from time import sleep
# Search ptpress.com.cn for "Java" books and download each result's cover
# image into ./实训二/, one file per book title.
import os
import re

driver = webdriver.Chrome()
wait = WebDriverWait(driver,4)
try:
    driver.get('http://www.ptpress.com.cn/search/books')
    # Named `search_box` rather than `input` to avoid shadowing the builtin.
    search_box = driver.find_element_by_id('searchVal')
    search_box.send_keys('Java')
    button = driver.find_element_by_xpath('//div[@class="search_main down_search"]/button')
    button.click()
    # Fixed pause to let the search results render before scraping them.
    sleep(5)

    # Create the output directory up front so open() below cannot fail on it.
    output_dir = "./实训二/"
    os.makedirs(output_dir, exist_ok=True)

    divs = driver.find_elements_by_class_name('book_item')
    for div in divs:
        # `src` attribute of the first <img> in the item: the cover image URL.
        img = div.find_elements_by_tag_name('img')[0].get_attribute('src')
        title = div.text
        print('当前下载:',title,"   ",img)
        # The element text is used as the file name, so replace characters
        # that are invalid in file names (path separators, newlines, ...).
        safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip()
        fileurl = output_dir + safe_title + ".jpg"
        # Bound the request so one stalled download cannot hang the script.
        r = requests.get(img, timeout=10)
        if not r.ok:
            # Do not save an HTTP error page as if it were an image.
            print('下载失败:', title, r.status_code)
            continue
        with open(fileurl, 'wb') as f:
            f.write(r.content)
finally:
    # Release the browser and driver process even if a step above raised.
    driver.quit()

image

将数据存储到MongoDB数据库中

import pymongo,requests,json
# Fetch the book list JSON from ptpress.com.cn and store each book's name,
# author and price in the `webspider` collection of the local `test` database.
ua = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) Chrome/65.0.3325.181'}
url = 'https://www.ptpress.com.cn/bookinfo/getBookListForWS'
client = pymongo.MongoClient('localhost:27017')
try:
    db = client['test']
    col = db['webspider']
    # Bound the request and fail loudly on an HTTP error instead of trying
    # to parse an error page as JSON.
    response = requests.get(url, headers=ua, timeout=10)
    response.raise_for_status()
    res = response.text
    data = json.loads(res)
    news = data['data']
    for book in news:
        a = {
            'name': book['bookName'],
            'author': book['author'],
            'price': book['price'],
        }
        col.insert_one(a)
        # Per the PyMongo docs, insert_one adds an `_id` key to the dict,
        # so the printed record includes it.
        print(a)
finally:
    # Close the MongoDB connection whether or not the import succeeded.
    client.close()

image

posted @ 2022-04-20 18:15  anyiya  阅读(35)  评论(0)  编辑  收藏  举报