大麦网演唱会信息爬取

main.py

import re
from time import sleep

import pymongo
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *

# Headless Chrome session shared by the scraping functions in this module.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
browser = webdriver.Chrome(options=options)
# Explicit waits built from this object give up after 10 seconds.
wait = WebDriverWait(driver=browser, timeout=10)

# MongoDB handle; the MONGO_* names are expected to be supplied by the
# star import from the config module.
client = pymongo.MongoClient(host=MONGO_URL, port=MONGO_PORT)
db = client[MONGO_DB]

def search_page(max_retries=3):
    """Open the Damai home page, search for "演唱会" and scrape the first result page.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True once the first result page has been handed to get_products(),
        False if every attempt timed out.
    """
    for _ in range(max_retries):
        try:
            browser.get("https://www.damai.cn/")
            search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.dm-header-wrap > div > div.search-header > input")))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.dm-header-wrap > div > div.search-header > div.btn-search")))
            search_input.send_keys("演唱会")
            submit.click()
            total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.search-box > div.search-box-top > span.search-box-keyword")))
            print("共找到" + total.text + "个结果")
            get_products()
            return True
        except TimeoutException:
            # WebDriverWait.until raises selenium's TimeoutException, which is
            # not the builtin TimeoutError; catching the builtin never retried.
            continue
    return False

def next_page(index, max_retries=3):
    """Click the pagination entry for page ``index`` and scrape that page.

    Args:
        index: Page number to switch to; also used as the ``nth-child``
            position of the pagination ``li`` that gets clicked.
        max_retries: Maximum number of click attempts before giving up.

    Returns:
        True if the page indicator showed ``index`` and the page was handed
        to get_products(), False if every attempt timed out or showed a
        different page number.
    """
    page_css_id = "body > div.search-box > div.search-box-flex > div.search-main > div.search__itemlist > div.pagination > div > ul > li:nth-child(" + str(index) + ")"
    current_page_css = "body > div.search-box > div.search-box-flex > div.search-main > div.search-sort.search-main-sort > div.pagination-top.search-sort_fr > div > span:nth-child(1)"
    # A bounded loop replaces the original unbounded self-recursion, which
    # could only end in success or a RecursionError.
    for _ in range(max_retries):
        try:
            switch_page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, page_css_id)))
            switch_page.click()
            now_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, current_page_css)))
            if str(index) == now_page.text:
                print("切换到第" + str(index) + "页")
                get_products()
                return True
        except TimeoutException:
            # selenium's TimeoutException is what WebDriverWait.until raises;
            # the builtin TimeoutError is a different, unrelated class.
            continue
    return False

def get_products():
    """Parse every item on the current result page and store each one.

    Waits for the result list to be present, parses the page source with
    PyQuery, builds one dict per listed item, prints it and passes it to
    save_to_mongo().
    """
    list_selector = "body > div.search-box > div.search-box-flex > div.search-main > div.search__itemlist > div.item__main > div"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, list_selector)))
    page = PyQuery(browser.page_source)

    for entry in page(list_selector).children().items():
        title = entry.find("div > div.items__txt__title > a").text()
        second_row = entry.find("div > div:nth-child(2)").text()

        # The second row holds either an artist line or the venue.
        has_artist = "艺人:" in second_row
        artist = re.sub("艺人:", "", second_row) if has_artist else ""

        if has_artist or not second_row:
            # With an artist line (or an empty second row) the venue and
            # date are read from the third and fourth rows.
            venue = entry.find("div > div:nth-child(3)").text()
            when = entry.find("div > div:nth-child(4)").text()
        else:
            venue = second_row
            when = entry.find("div > div:nth-child(3)").text()

        product = {
            "演唱会名字:": title,
            "乐队名字:": artist,
            "演唱地点": venue,
            "演唱日期": when,
        }
        print(product)
        save_to_mongo(product)

def save_to_mongo(result):
    """Insert one scraped record into the MONGO_TABLE collection.

    Args:
        result: Document (dict) to store.

    Returns:
        True if the document was stored, False if pymongo raised an error.
    """
    try:
        # insert_one signals failure by raising, not by a falsy return
        # value, so the outcome has to be detected with try/except.
        db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print("存储数据出错", result, exc)
        return False
    print("存储到数据库", result)
    return True

def main():
    """Scrape the first search result page, then pages 2 through 5.

    The browser and the MongoDB client are released when the crawl ends,
    whether it finished normally or raised.
    """
    try:
        search_page()
        for num in range(2, 6):
            next_page(num)
            # Pause between page switches.
            sleep(2)
    finally:
        # Without quit() the headless Chrome process outlives the script.
        try:
            browser.quit()
        finally:
            client.close()


if __name__ == "__main__":
    main()

config.py

# MongoDB connection settings: host and port of the server.
MONGO_URL = "localhost"
MONGO_PORT = 27017
# Database name and the collection that scraped records are written to.
MONGO_DB = "damai"
MONGO_TABLE = "yanchanghui"

# NOTE(review): not referenced by the main.py shown alongside this file;
# looks like leftover driver service flags -- confirm before removing.
SERVICE_ARGS = ["--load-images=false", "--disk-cache=true"]
posted @   z5onk0  阅读(292)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
点击右上角即可分享
微信分享提示