大麦网演唱会信息爬取
main.py
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery
import pymongo
from config import *
import re
# Headless Chrome session shared by all scraping functions in this script.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
browser = webdriver.Chrome(options=options)

# Explicit-wait helper bound to the session above; each wait.until() call
# gives up after 10 seconds.
wait = WebDriverWait(driver=browser, timeout=10)

# MongoDB connection and the database that receives the scraped records.
client = pymongo.MongoClient(host=MONGO_URL, port=MONGO_PORT)
db = client[MONGO_DB]
def search_page(retries=3):
    """Open damai.cn, search for "演唱会" and scrape the first result page.

    Args:
        retries: Maximum number of attempts before giving up.

    Returns:
        True once the result page was reached and handed to get_products();
        False if every attempt timed out.
    """
    # WebDriverWait.until() raises selenium's TimeoutException, which is not
    # related to the builtin TimeoutError, so the builtin never matched.
    from selenium.common.exceptions import TimeoutException

    header_css = "body > div.dm-header-wrap > div > div.search-header"
    total_css = "body > div.search-box > div.search-box-top > span.search-box-keyword"

    # A bounded loop replaces the original unbounded self-recursion.
    for attempt in range(1, retries + 1):
        try:
            browser.get("https://www.damai.cn/")
            search_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, header_css + " > input")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, header_css + " > div.btn-search")))
            search_box.send_keys("演唱会")
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, total_css)))
            print("共找到" + total.text + "个结果")
            get_products()
            return True
        except TimeoutException:
            print("搜索超时,第" + str(attempt) + "次尝试失败")
    return False
def next_page(index, retries=3):
    """Click pagination entry ``index`` and scrape the page it leads to.

    Args:
        index: Number used both to pick the ``li:nth-child(index)`` pagination
            entry and to compare against the current-page indicator text.
        retries: Maximum number of click-and-verify attempts.

    Returns:
        True if the indicator showed ``index`` and get_products() ran;
        False if every attempt timed out or showed a different page.
    """
    # WebDriverWait.until() raises selenium's TimeoutException, which is not
    # related to the builtin TimeoutError, so the builtin never matched.
    from selenium.common.exceptions import TimeoutException

    page_css_id = (
        "body > div.search-box > div.search-box-flex > div.search-main"
        " > div.search__itemlist > div.pagination > div > ul"
        " > li:nth-child(" + str(index) + ")"
    )
    now_page_css = (
        "body > div.search-box > div.search-box-flex > div.search-main"
        " > div.search-sort.search-main-sort"
        " > div.pagination-top.search-sort_fr > div > span:nth-child(1)"
    )

    # A bounded loop replaces the original unbounded self-recursion, which
    # could click forever when the indicator never matched.
    for _ in range(retries):
        try:
            switch_page = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, page_css_id)))
            switch_page.click()
            now_page = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, now_page_css)))
        except TimeoutException:
            continue
        if str(index) == now_page.text:
            print("切换到第" + str(index) + "页")
            get_products()
            return True
    print("切换到第" + str(index) + "页失败")
    return False
def get_products():
    """Parse each listing on the current result page and store it.

    Waits for the listing container, parses the page source with PyQuery and
    builds one record per child element. The second detail row is either an
    artist line (contains "艺人:") or the venue, which shifts the rows holding
    the venue and the date by one. Each record is printed and passed to
    save_to_mongo().
    """
    list_css = "body > div.search-box > div.search-box-flex > div.search-main > div.search__itemlist > div.item__main > div"

    def row_text(entry, position):
        # Text of the detail row at the given nth-child position.
        return entry.find("div > div:nth-child(" + str(position) + ")").text()

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, list_css)))
    listing = PyQuery(browser.page_source)(list_css)

    for entry in listing.children().items():
        course_name = entry.find("div > div.items__txt__title > a").text()
        second_row = row_text(entry, 2)

        has_artist = "艺人:" in second_row
        people_name = second_row.replace("艺人:", "") if has_artist else ""
        address = "" if has_artist else second_row

        if address:
            course_date = row_text(entry, 3)
        else:
            # Artist row present (or second row empty): venue and date each
            # sit one row further down.
            address = row_text(entry, 3)
            course_date = row_text(entry, 4)

        product = {
            "演唱会名字:": course_name,
            "乐队名字:": people_name,
            "演唱地点": address,
            "演唱日期": course_date,
        }
        print(product)
        save_to_mongo(product)
def save_to_mongo(result):
    """Insert one scraped record into the MONGO_TABLE collection.

    Args:
        result: Record dict to store.

    A failed insert is reported on stdout instead of aborting the crawl.
    """
    from pymongo.errors import PyMongoError

    # insert_one() signals failure by raising; its return value is a result
    # object that is always truthy, so the original if/else could never reach
    # its error branch.
    try:
        db[MONGO_TABLE].insert_one(result)
    except PyMongoError as exc:
        print("存储数据出错", result, exc)
    else:
        print("存储到数据库", result)
def main(last_page=5):
    """Scrape the first result page, then pages 2 through ``last_page``.

    Args:
        last_page: Number of the last result page to visit (inclusive).

    The browser session and the MongoDB client are always released on exit,
    including when scraping raises.
    """
    try:
        search_page()
        for num in range(2, last_page + 1):
            next_page(num)
            # Pause between page switches.
            sleep(2)
    finally:
        # Without quit() the headless Chrome process outlives the script.
        try:
            browser.quit()
        finally:
            client.close()


if __name__ == "__main__":
    main()
config.py
# MongoDB connection settings: host and port passed to MongoClient, the
# database name, and the collection that stores scraped records.
MONGO_URL = "localhost"
MONGO_PORT = 27017
MONGO_DB = "damai"
MONGO_TABLE = "yanchanghui"

# Extra command-line switches; not referenced by the scraper code in main.py.
# NOTE(review): these look like PhantomJS-style flags -- confirm whether
# anything still uses them.
SERVICE_ARGS = [
    "--load-images=false",
    "--disk-cache=true",
]
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库