无注释版
复制 | import pymongo |
| from selenium import webdriver |
| from selenium.common.exceptions import TimeoutException |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| from pyquery import PyQuery as pq |
| from urllib.parse import quote |
| |
| |
| |
| |
# Run Chrome without a visible window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# ``chrome_options=`` was deprecated and later removed from Selenium's Chrome
# constructor; ``options=`` is the supported keyword.
browser = webdriver.Chrome(options=chrome_options)

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'

# Search term sent to the Taobao search page.
KEYWORD = 'ipad'

# Last results page to crawl (pages are 1-based).
MAX_PAGE = 100

# NOTE(review): not referenced anywhere in the visible code; kept in case
# another part of the project reads it.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Explicit wait shared by all page interactions: up to 10 seconds per condition.
wait = WebDriverWait(browser, 10)
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
| |
| |
def index_page(page, max_retries=3):
    """
    Load one page of Taobao search results for KEYWORD and extract its products.

    The search URL is always opened first; for ``page > 1`` the pager's jump
    box is filled in and submitted. The function then waits until the pager
    marks the requested page as active and at least one item is present
    before calling ``get_products()``.

    :param page: 1-based page number to crawl
    :param max_retries: number of extra attempts after a TimeoutException
        before the page is skipped (the previous implementation retried
        through unbounded recursion)
    """
    print('正在爬取第', page, '页')
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    for attempt in range(max_retries + 1):
        try:
            browser.get(url)
            if page > 1:
                # Named page_input so the builtin input() is not shadowed.
                page_input = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(str(page))
                submit.click()
            # The highlighted pager entry must show the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            if attempt < max_retries:
                print('第', page, '页加载超时,正在重试')
            else:
                # Give up on this page so the crawl can continue with the next one.
                print('第', page, '页多次超时,已跳过')
| |
| |
def get_products():
    """
    Parse the browser's current page source and store every listed product.

    Each item under ``#mainsrp-itemlist`` becomes a dict with the keys
    image, price, deal, title, shop and location; the dict is printed and
    handed to ``save_to_mongo``.
    """
    # Fields whose value is the text of the element matched by the selector.
    text_selectors = {
        'price': '.price',
        'deal': '.deal-cnt',
        'title': '.title',
        'shop': '.shop',
        'location': '.location',
    }
    page = pq(browser.page_source)
    for node in page('#mainsrp-itemlist .items .item').items():
        # The image URL comes from an attribute rather than element text.
        record = {'image': node.find('.pic .img').attr('data-src')}
        for field, selector in text_selectors.items():
            record[field] = node.find(selector).text()
        print(record)
        save_to_mongo(record)
| |
| |
def save_to_mongo(result):
    """
    Insert one scraped product into the MONGO_COLLECTION collection.

    Failures are reported on stdout and not re-raised, so one bad write does
    not stop the crawl.

    :param result: product dict to store
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # single-document replacement.
        if db[MONGO_COLLECTION].insert_one(result):
            print('存储到MongoDB成功')
    except Exception as exc:
        # Show the cause instead of hiding it behind a generic message.
        print('存储到MongoDB失败', exc)
| |
| |
def main():
    """
    Crawl every results page from 1 to MAX_PAGE, then shut the browser down.
    """
    try:
        for page in range(1, MAX_PAGE + 1):
            index_page(page)
    finally:
        # Release the browser even if a page raises. quit() ends the whole
        # WebDriver session, whereas close() only closes the current window.
        browser.quit()
| |
| |
| if __name__ == '__main__': |
| main() |
有注释版
复制 | from selenium import webdriver |
| from selenium.common.exceptions import TimeoutException |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.support import expected_conditions as EC |
| from selenium.webdriver.support.wait import WebDriverWait |
| from urllib.parse import quote |
| from pyquery import PyQuery |
| from pymongo import MongoClient |
| |
# Search term sent to the Taobao search page.
KEYWORD = 'iPad'

# One Chrome session shared by the whole script, plus an explicit wait that
# allows up to 10 seconds per condition.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
| |
| |
def index_page(page, max_retries=3):
    """
    Load one page of Taobao search results for KEYWORD and extract its products.

    The search URL is always opened first; for ``page > 1`` the pager's jump
    box is filled in and submitted. The function then waits until the pager
    marks the requested page as active and at least one item is present
    before calling ``get_products()``.

    :param page: 1-based page number to crawl
    :param max_retries: number of extra attempts after a TimeoutException
        before the page is skipped (the previous implementation retried
        through unbounded recursion)
    """
    print('正在爬取第', page, '页')
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    for attempt in range(max_retries + 1):
        try:
            browser.get(url)
            if page > 1:
                # Page-number box of the pager. The selector previously read
                # '#mainsrp-page', which does not match the '#mainsrp-pager'
                # id used by the other pager selectors and so always timed out.
                page_input = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')
                    )
                )
                # "Go" button next to the page-number box.
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')
                    )
                )
                page_input.clear()
                page_input.send_keys(str(page))
                submit.click()
            # The highlighted pager entry must show the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)
                )
            )
            # At least one product must be present before parsing.
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.m-itemlist .items .item')
                )
            )
            get_products()
            return
        except TimeoutException:
            if attempt < max_retries:
                print('第', page, '页加载超时,正在重试')
            else:
                # Give up on this page so the crawl can continue with the next one.
                print('第', page, '页多次超时,已跳过')
| |
| |
def get_products():
    """
    Parse the browser's current page source and store every listed product.

    Each item under ``#mainsrp-itemlist`` becomes a dict with the keys
    image, price, deal, title, shop and location; the dict is printed and
    handed to ``save_to_mongo``.
    """
    html = browser.page_source
    doc = PyQuery(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            # Image URL, read from the data-src attribute.
            'image': item.find('.pic .J_ItemPic.img').attr('data-src'),
            'price': item.find('.price.g_price.g_price-highlight').text(),
            # Text of the deal-count element.
            'deal': item.find('.deal-cnt').text(),
            # Class selector needs its leading dot; 'row.row-2.title' would
            # look for a <row> tag and never match.
            'title': item.find('.row.row-2.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)
| |
| |
# MongoDB connection settings: host, database name and target collection.
MONGO_URL = 'localhost'
MONGO_DB = 'TaoBao'
MONGO_COLLECTION = 'products'

# Client and database handle shared by save_to_mongo().
client = MongoClient(MONGO_URL)
db = client[MONGO_DB]
| |
| |
def save_to_mongo(result):
    """
    Insert one scraped product into the MONGO_COLLECTION collection.

    Failures are reported on stdout and not re-raised, so one bad write does
    not stop the crawl.

    :param result: product dict to store
    :return: None
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # single-document replacement.
        if db[MONGO_COLLECTION].insert_one(result):
            print('存储到 MongoDB 成功')
    except Exception as exc:
        # Show the cause instead of hiding it behind a generic message.
        print('存储到 MongoDB 失败', exc)
| |
| |
# Last results page to crawl (pages are 1-based).
MAX_PAGE = 100


def main():
    """
    Crawl every results page from 1 to MAX_PAGE, then shut the browser down.

    :return: None
    """
    try:
        for page in range(1, MAX_PAGE + 1):
            index_page(page)
    finally:
        # Release the browser even if a page raises. quit() ends the whole
        # WebDriver session, whereas close() only closes the current window.
        browser.quit()
| |
| |
| if __name__ == '__main__': |
| main() |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· winform 绘制太阳,地球,月球 运作规律
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾(3.3-3.9)