import random
import time
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Any

import pyquery
import selenium
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

from ConnectionPool import Client_Pool
from cookies.CookieSaver import CookieSaver
class BaseCrawler(ABC):
    """Abstract interface that every concrete crawler must implement."""

    @abstractmethod
    def crawl(self, url: str):
        """Fetch the page(s) behind *url*."""
        ...

    @abstractmethod
    def parse(self):
        """Extract structured data from the fetched content."""
        ...

    @abstractmethod
    def save(self):
        """Persist the parsed data."""
        ...
class CrawlerData:
    """Tabular accumulator for scraped rows.

    Holds a mapping of column names to CSS selectors and collects one
    dict per scraped row; missing columns are padded with None when a
    row is finalized.
    """

    # NOTE: the original annotations used the builtin ``any`` (a function)
    # where ``typing.Any`` was intended — fixed below.
    dataList: list[dict[str, Any]]  # finished rows
    dataDict: dict[str, Any]        # row currently being filled
    cssDict: dict[str, str]         # column name -> CSS selector

    def __init__(self, css_dict: dict[str, str]):
        """css_dict: mapping of column name to the CSS selector that extracts it."""
        self.cssDict = css_dict
        self.dataList = []
        self.dataDict = {}

    def write(self, col_name: str, value: Any) -> None:
        """Set one cell of the row currently being built."""
        self.dataDict[col_name] = value

    def css(self, name: str) -> str:
        """Return the CSS selector for *name*, or "" if unknown."""
        return self.cssDict.get(name, "")

    def nextRow(self) -> None:
        """Finalize the current row: pad missing columns with None, append, reset."""
        for col_name in self.cssDict:
            # setdefault keeps any value already written and fills gaps
            # with None — identical to the original get-then-assign loop.
            self.dataDict.setdefault(col_name, None)
        self.dataList.append(self.dataDict)
        self.dataDict = {}

    def getColName(self) -> tuple[str, ...]:
        """Return the configured column names as a tuple."""
        return tuple(self.cssDict)
class Crawler(BaseCrawler):
    """Selenium/Edge based crawler that persists scraped rows to MongoDB.

    Concrete subclasses implement crawl()/parse(); save() upserts the
    accumulated rows into the Hotel.XIECHENG collection.
    """

    driver: webdriver.Edge       # shared Edge session used for all page loads
    data: CrawlerData            # row accumulator filled by parse()
    cookieSaver: CookieSaver     # persists/restores login cookies

    def __init__(self, url_list: list[str]):
        """Start an Edge session with mild anti-bot-detection flags.

        url_list: the pages this crawler will visit.
        """
        options = Options()
        # Hide the navigator.webdriver automation flag from the site.
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        # Plain-browser user agent so the session is not flagged as headless/automated.
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0')
        self.driver = webdriver.Edge(options=options)
        self.urlList = url_list

    def crawl(self, url: str):
        pass

    def parse(self):
        pass

    def save(self):
        """Upsert every accumulated row into MongoDB (Hotel.XIECHENG)."""
        with MongoClient() as client:
            collection = client["Hotel"]["XIECHENG"]
            for row in self.data.dataList:
                # The full document doubles as the filter, so rows that are
                # byte-identical get de-duplicated instead of re-inserted.
                collection.update_one(row, {"$set": row}, upsert=True)

    @staticmethod
    def randomWait(min_time: float = 0.5, max_time: float = 1.5) -> None:
        """Sleep a random duration in [min_time, max_time] seconds.

        BUG FIX: the original definition took no ``self`` and was not a
        staticmethod, so ``self.randomWait(...)`` would mis-bind the
        instance to ``min_time``; ``@staticmethod`` makes it callable on
        both the class and instances.
        """
        time.sleep(random.uniform(min_time, max_time))
class HotelCrawler(Crawler):
    """Scrapes ctrip.com hotel listing pages into CrawlerData rows."""

    def __init__(self, url_list: list[str]):
        # CSS selector for one hotel card in the result list.
        self.goods_css = "div.card-item-wrap"
        # Column name -> selector, evaluated relative to each card.
        css_dict = {
            "title": ".list-card-title span",
            "location": "span.ads",
            "price": "span.real-price",
            "tags": "div.list-card-tag",
            "comment": "div.list-card-comment p.count",
            "score": "div.score span"
        }
        self.data = CrawlerData(css_dict)
        super().__init__(url_list)
        self.cookieSaver = CookieSaver(self.driver)

    def randomScroll(self):
        """Scroll down by a random 70%-100% of the page height (human-like)."""
        self.driver.execute_script(
            "window.scrollBy(0,(Math.random()*0.3+0.7)*document.body.scrollHeight);")

    def parse(self):
        """Extract one row per hotel card from the current page source."""
        doc = pyquery.PyQuery(self.driver.page_source)
        for card in doc(self.goods_css).items():
            for col, css in self.data.cssDict.items():
                self.data.write(col, card(css).text())
            # presumably cookieSaver.cookies exposes the site domain — TODO confirm
            self.data.write("domain", self.cookieSaver.cookies.domain)
            self.data.write("time", datetime.now().date().isoformat())
            self.data.nextRow()

    def findMore(self):
        """Click the "load more" button.

        Returns True on success, False when the button is missing or not
        yet clickable (caller scrolls and retries).
        """
        try:
            target = self.driver.find_element(By.CSS_SELECTOR, "div.list-btn-more div")
            self.driver.execute_script(
                "arguments[0].scrollIntoView({block: 'center',inline: 'center'});", target)
            target.click()
            return True
        except Exception:
            return False

    def crawlAllURL(self, times=10):
        """Crawl every configured URL, expanding each list up to *times* clicks."""
        for url in self.urlList:
            self.crawl(url, times)

    def crawl(self, url: str, times=10):
        """Open *url*, ensure a valid login cookie, expand the list, parse, save.

        times: maximum number of "load more" clicks before stopping.
        """
        # Loaded twice: the first request lands on the domain so cookies can
        # be applied to it, the second reloads with them active — TODO confirm
        # the double get is required.
        self.driver.get(url)
        self.driver.get(url)
        load = self.cookieSaver.load_cookies()
        valid = self.cookieSaver.is_cookie_valid()
        while not load or not valid:
            input("请登录后按回车键继续...")
            self.cookieSaver.save_cookies()
            load = self.cookieSaver.load_cookies()
            valid = self.cookieSaver.is_cookie_valid()
        more_times = 0
        try:
            while True:
                if self.findMore():
                    more_times += 1
                    if more_times > times:
                        break
                else:
                    self.randomScroll()
                # BUG FIX: the original called a bare ``randomWait(2.5, 3)``,
                # which raised NameError (the helper lives on Crawler, not at
                # module level) and aborted the loop on the first iteration
                # via the except branch below.
                time.sleep(random.uniform(2.5, 3))
        except Exception as e:
            # Best effort: persist whatever was collected before the error.
            self.parse()
            self.save()
            print(f'遇到错误:{e}'
                  f'已经当前数据存储')
        self.parse()
        self.save()
if __name__ == '__main__':
    # Entry point: scrape the Zhejiang province hotel listing, clicking
    # "load more" up to 100 times per URL before parsing and saving.
    target_urls = [
        "https://hotels.ctrip.com/hotels/list?countryId=1&city=-1&optionId=16&optionType=Province&display=%E6%B5%99%E6%B1%9F%2C+%E4%B8%AD%E5%9B%BD",
    ]
    HotelCrawler(target_urls).crawlAllURL(100)
# NOTE(review): the lines below are web-page ads/boilerplate accidentally
# captured when this source was copied from a blog page; they are not code
# and made the module fail to parse. Commented out to keep the file valid.
# 【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
# 【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
# 【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
# 【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
# · 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
# · Manus爆火,是硬核还是营销?
# · 终于写完轮子一部分:tcp代理 了,记录一下
# · 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
# · 单元测试从入门到精通