Scraping Product Information from JD.com
Use Selenium to scrape the first three pages of search results for a product (e.g. "ThinkPad") on JD.com, extracting each item's title, price, image link, comment count, and shop name, and store the records in a MySQL database.
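The full script below drives a visible Chrome window. If you would rather run the crawl in the background, Chrome's headless mode can be switched on when the driver is created. A minimal sketch, assuming a local Chrome and a matching chromedriver install (the --headless switch is a standard Chrome flag, not part of the original script):

from selenium import webdriver

# Assumption: Chrome and a matching chromedriver are on PATH.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Give the page a real viewport so lazy-loaded images still render.
options.add_argument('--window-size=1920,1080')
browser = webdriver.Chrome(options=options)

Everything else in the script stays the same; only the browser = webdriver.Chrome() line changes.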
import time
import pymysql
from bs4 import BeautifulSoup
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
KEYWORD = 'ThinkPad'


def index_page(page):
    """Load one page of search results, then hand off to get_products()."""
    print('Crawling page', page)
    try:
        url = 'https://search.jd.com/Search?keyword=' + quote(KEYWORD)
        browser.get(url)
        time.sleep(1)
        # Scroll to the bottom so lazy-loaded items render
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)
        wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[@class="gl-item"]')))
        if page > 1:
            # Jump to the target page via the page-number input box at the bottom
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
            page_input.clear()
            page_input.send_keys(page)
            page_input.send_keys(Keys.ENTER)
            time.sleep(1)
            # Scroll to the bottom again so the new page's items render
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(2)
            # Wait until the highlighted page number matches the requested page
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))
        get_products()
    except TimeoutException:
        # index_page(page)  # optionally retry the same page
        print('index_page: TimeoutException')


def get_products():
    """Parse the current page: title, price, image link, comment count, shop name."""
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('#J_goodsList li.gl-item'):
        # Lazy-loaded images expose the URL in data-lazy-img; once loaded,
        # that attribute becomes 'done' and the URL moves to src
        image = li.select('.p-img > a > img')[0]['data-lazy-img']
        if image == 'done':
            image = li.select('.p-img > a > img')[0].attrs['src']
        product = {
            'image': image,
            'title': li.select('.p-img a')[0].attrs['title'],
            'price': li.select('.p-price > strong')[0].get_text(),
            'commit': li.select('.p-commit > strong > a')[0].get_text(),
            'shop': li.select('.p-shop > span > a')[0].get_text(),
        }
        print(product)
        write_to_sql(product)


def create_sql():
    """Create the spiders database and the JD table."""
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
    cursor = db.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS spiders DEFAULT CHARACTER SET utf8")
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
    cursor = db.cursor()
    # commit is a reserved word in MySQL, so the column name must be backquoted
    sql = ("CREATE TABLE IF NOT EXISTS JD (image VARCHAR(255) NOT NULL, "
           "title VARCHAR(255) NOT NULL, price VARCHAR(255) NOT NULL, "
           "`commit` VARCHAR(255) NOT NULL, shop VARCHAR(255) NOT NULL)")
    cursor.execute(sql)
    db.close()


def write_to_sql(data):
    """Insert one product dict; update the row if a unique key already matches."""
    table = 'JD'
    # Backquote every column name because 'commit' is reserved in MySQL
    keys = ', '.join('`{}`'.format(key) for key in data)
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table} ({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE '.format(
        table=table, keys=keys, values=values)
    sql += ', '.join('`{}` = %s'.format(key) for key in data)
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
    cursor = db.cursor()
    try:
        cursor.execute(sql, tuple(data.values()) * 2)
        db.commit()
        print('Successful')
    except Exception:
        print('Failed')
        db.rollback()
    db.close()


def main():
    create_sql()
    for i in range(1, 4):  # first 3 pages
        index_page(i)
        time.sleep(5)


if __name__ == '__main__':
    main()
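One caveat about the storage layer: the JD table is created without any primary or unique key, so the ON DUPLICATE KEY UPDATE clause in write_to_sql never triggers and re-running the crawl appends duplicate rows. A minimal sketch of a fix, assuming the same local MySQL credentials as above and treating title as the deduplication key (an assumption; the original schema defines no key at all):

import pymysql

# Assumption: same local MySQL instance and credentials as the script above.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, db='spiders')
cursor = db.cursor()
# A unique index on title lets ON DUPLICATE KEY UPDATE actually fire.
cursor.execute('ALTER TABLE JD ADD UNIQUE KEY uniq_title (title)')
db.commit()

# Quick sanity check: show a few stored rows.
cursor.execute('SELECT title, price, shop FROM JD LIMIT 5')
for row in cursor.fetchall():
    print(row)
db.close()

With the unique key in place, crawling the same pages again updates the existing rows instead of inserting duplicates.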