A Ctrip Hotel Scraper

A hotel-list scraper for Ctrip (携程), built on Selenium (Edge), PyQuery, and MongoDB.
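
The flow: drive Microsoft Edge through Selenium with a few anti-detection tweaks, reuse saved login cookies (prompting for a manual login when they are missing or stale), alternate between clicking the list page's "load more" button and random scrolling to trigger lazy loading, parse each hotel card with PyQuery, and upsert the rows into a local MongoDB collection.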

import random
import time
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Any

import pyquery
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

from cookies.CookieSaver import CookieSaver  # local helper module; a sketch is included at the end of this post


class BaseCrawler(ABC):
    @abstractmethod
    def crawl(self, url: str):
        pass

    @abstractmethod
    def parse(self):
        pass

    @abstractmethod
    def save(self):
        pass



class CrawlerData:
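    # Buffers scraped rows: write() fills the current row column by column,
    # nextRow() commits it; cssDict maps column names to CSS selectors.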
    dataList: list[dict[str, Any]]
    dataDict: dict[str, Any]
    cssDict: dict[str, str]

    def __init__(self, css_dict: dict[str, str]):
        self.cssDict = css_dict
        self.dataList = []
        self.dataDict = {}

    def write(self, col_name: str, value: Any):
        self.dataDict[col_name] = value

    def css(self, name: str) -> str:
        return self.cssDict.get(name, "")

    def nextRow(self):
        # Fill any column that was not written with None, then commit the row.
        for col_name in self.cssDict.keys():
            self.dataDict.setdefault(col_name, None)
        self.dataList.append(self.dataDict)
        self.dataDict = {}

    def getColName(self):
        return tuple(self.cssDict.keys())


class Crawler(BaseCrawler):
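    # Owns the Edge driver, the scraped-data buffer, and the MongoDB save
    # step; subclasses fill in crawl() and parse().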
    driver: webdriver.Edge
    data: CrawlerData
    cookieSaver: CookieSaver

    def crawl(self, url: str):
        pass

    def parse(self):
        pass

    def save(self):
        # Upsert every buffered row into MongoDB, using the whole record as
        # the filter so re-running the crawler does not insert duplicates.
        with MongoClient() as client:
            db = client["Hotel"]
            collection = db["XIECHENG"]
            for row in self.data.dataList:
                collection.update_one(row, {"$set": row}, upsert=True)

    def __init__(self, url_list: list[str]):
        # Anti-detection settings: hide the automation flag and spoof a regular browser user agent.
        options = Options()
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0')
        self.driver = webdriver.Edge(options=options)
        self.urlList = url_list


def randomWait(min_time=0.5, max_time=1.5):
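    # Sleep for a random interval to mimic human pacing between actions.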
    time.sleep(random.uniform(min_time, max_time))


class HotelCrawler(Crawler):
    def __init__(self, url_list: list[str]):
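        # goods_css selects each hotel "card" on the list page; css_dict maps
        # output columns to CSS selectors inside a card.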
        self.goods_css = "div.card-item-wrap"
        css_dict = {
            "title": ".list-card-title span",
            "location": "span.ads",
            "price": "span.real-price",
            "tags": "div.list-card-tag",
            "comment": "div.list-card-comment p.count",
            "score": "div.score span"
        }
        self.data = CrawlerData(css_dict)
        super().__init__(url_list)
        self.cookieSaver = CookieSaver(self.driver)

    def randomScroll(self):
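        # Jump down by 70-100% of the page height to trigger lazy loading.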
        self.driver.execute_script("window.scrollBy(0,(Math.random()*0.3+0.7)*document.body.scrollHeight);")

    def parse(self):
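        # Parse the rendered page with PyQuery and buffer one row per hotel
        # card, tagging each row with the cookie domain and today's date.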
        doc = pyquery.PyQuery(self.driver.page_source)
        goods = doc(self.goods_css).items()
        for g in goods:
            for col, css in self.data.cssDict.items():
                self.data.write(col, g(css).text())
            self.data.write("domain", self.cookieSaver.cookies.domain)
            self.data.write("time", datetime.now().date().isoformat())
            self.data.nextRow()

    def findMore(self):
        # Click the "load more" button if it is present; return False when it
        # is missing or not yet clickable.
        try:
            target = self.driver.find_element(By.CSS_SELECTOR, "div.list-btn-more div")
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'center'});", target)
            target.click()
            return True
        except Exception:
            return False

    def crawlAllURL(self, times=10):
        for url in self.urlList:
            self.crawl(url, times)

    def crawl(self, url: str, times=10):
        # Open the page once so cookies can be attached to this domain.
        self.driver.get(url)
        load = self.cookieSaver.load_cookies()
        valid = self.cookieSaver.is_cookie_valid()
        while not load or not valid:
            input("Log in in the browser window, then press Enter to continue...")
            self.cookieSaver.save_cookies()
            load = self.cookieSaver.load_cookies()
            valid = self.cookieSaver.is_cookie_valid()
        # Reload so the restored cookies take effect.
        self.driver.get(url)
        more_times = 0
        try:
            # Alternate between clicking "load more" and random scrolling
            # until the button has been clicked `times` times.
            while True:
                if self.findMore():
                    more_times += 1
                    if more_times > times:
                        break
                else:
                    self.randomScroll()
                randomWait(2.5, 3)
        except Exception as e:
            print(f"Error encountered: {e}; saving the data collected so far")
        self.parse()
        self.save()


if __name__ == '__main__':
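    # Example run: Zhejiang province hotel list, clicking "load more" up to 100 times.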
    urls = [
        "https://hotels.ctrip.com/hotels/list?countryId=1&city=-1&optionId=16&optionType=Province&display=%E6%B5%99%E6%B1%9F%2C+%E4%B8%AD%E5%9B%BD",
    ]
    crawler = HotelCrawler(urls)
    crawler.crawlAllURL(100)
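
The code imports CookieSaver from a local cookies package that is not included in the post. Below is a minimal sketch of what that helper could look like, based only on how it is used above (save_cookies, load_cookies, is_cookie_valid, and a cookies.domain attribute). The storage path, the SimpleNamespace holder, and the validity check are my assumptions, not the original implementation.

# cookies/CookieSaver.py -- hypothetical sketch; method names match their
# usage above, everything else is an assumption.
import json
import os
from types import SimpleNamespace


class CookieSaver:
    def __init__(self, driver, path="cookies/ctrip_cookies.json"):
        self.driver = driver
        self.path = path  # assumed storage location
        self.cookies = SimpleNamespace(domain="")

    def save_cookies(self):
        # Persist the browser's current cookies to disk.
        with open(self.path, "w", encoding="utf-8") as f:
            json.dump(self.driver.get_cookies(), f)

    def load_cookies(self) -> bool:
        # Restore saved cookies into the browser; the driver must already be
        # on the matching domain. Returns False when nothing is saved yet.
        if not os.path.exists(self.path):
            return False
        with open(self.path, encoding="utf-8") as f:
            for cookie in json.load(f):
                self.driver.add_cookie(cookie)
                self.cookies.domain = cookie.get("domain", self.cookies.domain)
        return True

    def is_cookie_valid(self) -> bool:
        # Assumed check: at least one Ctrip cookie is present in the session.
        # The real validity test in the original module is unknown.
        return any(c.get("domain", "").endswith("ctrip.com")
                   for c in self.driver.get_cookies())

To run the scraper you also need a MongoDB instance on the default localhost port (the bare MongoClient() call), Microsoft Edge installed (recent Selenium releases can resolve the matching driver automatically), and a Ctrip account for the manual login step.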
