Assignment 4

Assignment contents

Assignment ①:
Requirements:
Become proficient with Selenium: locating HTML elements, scraping data from Ajax-loaded pages, and waiting for HTML elements.
Use the Selenium framework + MySQL database storage to scrape stock data from the three boards "沪深A股", "上证A股", and "深证A股".
Candidate site: Eastmoney (东方财富网): http://quote.eastmoney.com/center/gridlist.html#hs_a_board
Output: store the results in MySQL; column headers should use English names, e.g. 序号 → id, 股票代码 → bStockNo……, with the header design left to the students (a schema sketch follows below).
Gitee folder link: https://gitee.com/see-no-green-hills/data-acquisition/tree/master/实验4/1
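
A minimal sketch of what the MySQL side with English column headers could look like (the pymysql driver, the connection parameters, and most column names here are illustrative assumptions; the full script below stores the same fields in SQLite instead):

import pymysql

# Illustrative MySQL schema with English column headers; connection parameters
# and column names such as latestPrice are assumptions, not mandated by the assignment.
connection = pymysql.connect(host="localhost", user="root", password="******",
                             database="stocks", charset="utf8mb4")
try:
    with connection.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS stock_info (
                id INT AUTO_INCREMENT PRIMARY KEY,  -- 序号
                bStockNo VARCHAR(16),               -- 股票代码
                bStockName VARCHAR(32),             -- 股票名称
                latestPrice DECIMAL(10, 2),         -- 最新报价
                changePercent VARCHAR(16),          -- 涨跌幅
                changeAmount DECIMAL(10, 2),        -- 涨跌额
                volume VARCHAR(16),                 -- 成交量
                turnover VARCHAR(16)                -- 成交额
                -- the remaining columns (振幅 / 最高 / 最低 / 今开 / 昨收) follow the same pattern
            ) DEFAULT CHARSET = utf8mb4
        """)
    connection.commit()
finally:
    connection.close()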

import sqlite3
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import logging
import sys
import io
import os

# Force Python's stdout to UTF-8
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("stock_scraper.log", encoding='utf-8')  # also write logs to a file
    ]
)

class StockScraper:
    def __init__(self, driver_path, db_path='stock_data.db'):
        self.driver_path = driver_path
        self.db_path = os.path.abspath(db_path)  # use an absolute path
        self.driver = self.setup_driver()
        self.board_xpaths = {
            "沪深A股": '//*[@id="nav_hs_a_board"]',
            "上证A股": '//*[@id="nav_sh_a_board"]',
            "深证A股": '//*[@id="nav_sz_a_board"]'
        }
        self.initialize_database()  # initialize the database

    def setup_driver(self):
        service = Service(executable_path=self.driver_path)
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        # Optional: headless mode
        # options.add_argument("--headless")
        driver = webdriver.Chrome(service=service, options=options)
        logging.info("WebDriver 启动成功。")
        return driver

    def initialize_database(self):
        """Initialize the SQLite database and the stock_info table."""
        connection = None
        try:
            connection = sqlite3.connect(self.db_path)
            cursor = connection.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS stock_info (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    stock_code TEXT,
                    stock_name TEXT,
                    latest_price REAL,
                    change_percentage TEXT,
                    change_amount REAL,
                    volume TEXT,
                    turnover TEXT,
                    amplitude REAL,
                    highest_price REAL,
                    lowest_price REAL,
                    opening_price REAL,
                    previous_close REAL
                )
            """)
            connection.commit()
            logging.info(f"数据库已初始化,数据库文件位于: {self.db_path}")
        except sqlite3.Error as e:
            logging.error(f"数据库初始化失败: {e}")
            raise
        finally:
            if connection:
                connection.close()  # closing the connection also releases its cursors

    def navigate_to_website(self, url):
        self.driver.get(url)
        logging.info(f"已打开网站: {url}")
        try:
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.ID, "table_wrapper-table"))
            )
            logging.info("页面主要元素加载完成。")
        except Exception as e:
            logging.error(f"页面加载失败: {e}")
            self.driver.quit()
            raise

    def clean_data(self, raw_text, field, code):
        if raw_text in ['-', '--']:
            return None
        cleaned = re.sub(r"[^\d\.-]", "", raw_text)
        try:
            return float(cleaned)
        except ValueError:
            logging.warning(f"解析失败 - 股票代码: {code}, 字段: {field}, 原始值: {raw_text}")
            return None

    def extract_stock_data(self):
        rows = self.driver.find_elements(By.XPATH, '//*[@id="table_wrapper-table"]/tbody/tr')
        stocks = []
        for idx, row in enumerate(rows, start=1):
            try:
                cells = row.find_elements(By.TAG_NAME, "td")
                stock_code = cells[1].text.strip()
                stock_name = cells[2].text.strip()
                latest_price = self.clean_data(cells[4].text.strip(), "最新报价", stock_code)
                change_percentage = cells[5].text.strip()
                change_amount = self.clean_data(cells[6].text.strip(), "涨跌额", stock_code)
                volume = cells[7].text.strip()
                turnover = cells[8].text.strip()
                amplitude = self.clean_data(cells[9].text.strip(), "振幅", stock_code)
                highest_price = self.clean_data(cells[10].text.strip(), "最高", stock_code)
                lowest_price = self.clean_data(cells[11].text.strip(), "最低", stock_code)
                opening_price = self.clean_data(cells[12].text.strip(), "今开", stock_code)
                previous_close = self.clean_data(cells[13].text.strip(), "昨收", stock_code)

                stock = (
                    stock_code, stock_name, latest_price, change_percentage, change_amount,
                    volume, turnover, amplitude, highest_price, lowest_price, opening_price, previous_close
                )
                stocks.append(stock)
            except Exception as e:
                logging.error(f"解析第 {idx} 行数据失败: {e}")
        logging.info(f"当前页面共提取到 {len(stocks)} 条股票数据。")
        return stocks

    def switch_board_and_scrape(self, board_name, xpath):
        try:
            board_element = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            board_element.click()
            logging.info(f"已切换到板块: {board_name}")
            time.sleep(3)  # wait for the board's Ajax data to load
            return self.extract_stock_data()
        except Exception as e:
            logging.error(f"切换到板块 {board_name} 失败: {e}")
            return []

    def scrape_all_boards(self):
        all_data = []
        for board, xpath in self.board_xpaths.items():
            logging.info(f"开始抓取板块: {board}")
            board_data = self.switch_board_and_scrape(board, xpath)
            all_data.extend(board_data)
            logging.info(f"板块 {board} 抓取完成,共 {len(board_data)} 条数据。")
        return all_data

    def save_to_database(self, data):
        connection = sqlite3.connect(self.db_path)
        cursor = connection.cursor()
        insert_query = """
            INSERT INTO stock_info (
                stock_code, stock_name, latest_price, change_percentage, change_amount,
                volume, turnover, amplitude, highest_price, lowest_price, opening_price, previous_close
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
        try:
            cursor.executemany(insert_query, data)
            connection.commit()
            logging.info(f"成功插入 {cursor.rowcount} 条数据到数据库。")
        except sqlite3.Error as e:
            logging.error(f"数据库插入失败: {e}")
            connection.rollback()
        finally:
            cursor.close()
            connection.close()
            logging.info("数据库连接已关闭。")

    def verify_data_insertion(self):
        """Check whether any rows were written to the database."""
        connection = None
        try:
            connection = sqlite3.connect(self.db_path)
            cursor = connection.cursor()
            cursor.execute("SELECT COUNT(*) FROM stock_info")
            count = cursor.fetchone()[0]
            logging.info(f"数据库中共有 {count} 条记录。")
        except sqlite3.Error as e:
            logging.error(f"数据库查询失败: {e}")
        finally:
            if connection:
                connection.close()

    def close(self):
        self.driver.quit()
        logging.info("WebDriver 已关闭。")

def main():
    DRIVER_PATH = r"D:\ChromeDriver\chromedriver-win64\chromedriver.exe"  # replace with your actual chromedriver path
    TARGET_URL = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"

    scraper = StockScraper(driver_path=DRIVER_PATH)
    try:
        scraper.navigate_to_website(TARGET_URL)
        all_stock_data = scraper.scrape_all_boards()
        scraper.save_to_database(all_stock_data)
        scraper.verify_data_insertion()
        
        # Print the first 5 rows as a sanity check
        logging.info(f"总共抓取到 {len(all_stock_data)} 条数据。")
        for stock in all_stock_data[:5]:
            logging.info(stock)
    finally:
        scraper.close()

if __name__ == "__main__":
    # On Windows, switch the console code page to UTF-8
    if os.name == 'nt':
        os.system('chcp 65001')
    main()
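
The script above collects only the first page of each board. If more pages are needed, one possible extension is to click the next-page control in a loop; a rough sketch (the "a.next.paginate_button" selector is an assumption and must be checked against the live page):

import time
from selenium.webdriver.common.by import By

def scrape_board_pages(scraper, max_pages=3):
    """Collect up to max_pages pages of the board currently shown by a StockScraper.
    The next-page selector used here is an assumed placeholder."""
    collected = []
    for _ in range(max_pages):
        collected.extend(scraper.extract_stock_data())
        try:
            next_btn = scraper.driver.find_element(By.CSS_SELECTOR, "a.next.paginate_button")
        except Exception:
            break  # no pagination control found on the page
        if "disabled" in (next_btn.get_attribute("class") or ""):
            break  # already on the last page
        next_btn.click()
        time.sleep(2)  # wait for the Ajax table to refresh
    return collected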


Assignment ②:
Requirements:
Become proficient with Selenium: locating HTML elements, simulating a user login (a minimal login sketch follows this block), scraping Ajax-loaded page data, and waiting for HTML elements.
Use the Selenium framework + MySQL to scrape course information from the Chinese MOOC platform icourse163 (course number, course name, school name, lead teacher, team members, number of participants, course progress, course description).
Candidate site: icourse163 (中国大学MOOC): https://www.icourse163.org
Output: MySQL database storage and output format.
Gitee folder link: https://gitee.com/see-no-green-hills/data-acquisition/tree/master/实验4/2
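
The requirements also mention simulating a user login, which the script below skips. A minimal sketch of one way to handle it before scraping (assumptions: the login itself is completed manually, e.g. by scanning the QR code, and the ".ux-id-blockCoin" selector used to detect the logged-in state is a placeholder, not verified against the live page):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_manual_login(driver, timeout=300):
    """Open the site and block until a manual login completes.
    The ".ux-id-blockCoin" avatar selector is an assumed placeholder."""
    driver.get("https://www.icourse163.org")
    print("Please log in (e.g. scan the QR code) in the opened browser window...")
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".ux-id-blockCoin"))
    )
    print("Login detected, continuing with scraping.")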

import time
import random
import sqlite3
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to the ChromeDriver executable
driver_path = "D:\\ChromeDriver\\chromedriver-win64\\chromedriver.exe"
options = Options()
options.add_argument("--start-maximized")  # 启动时最大化窗口
options.add_argument("--disable-gpu")  # 禁用GPU加速
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service, options=options)



driver.get('https://www.icourse163.org/channel/2001.htm')

# Wait until the course-list container is present on the page
WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[4]/div[2]/div/div/div/div[2]/div[1]/div/div[1]')))

# Randomized extra wait so the page can finish rendering
time.sleep(random.uniform(2, 5))

# Open the SQLite database connection (the file is created if it does not exist)
conn = sqlite3.connect('course_data.db')
cursor = conn.cursor()

# Create the courses table if it does not exist
cursor.execute('''
CREATE TABLE IF NOT EXISTS courses (
    course_name TEXT,
    school_name TEXT,
    teacher_name TEXT,
    participants TEXT,
    course_progress TEXT
)
''')

# Holds all the scraped course records
course_data = []

# Assume each page shows 20 courses and iterate over them
for i in range(1, 21):  # 20 per page is used here as an example
    try:
        # Absolute XPath of the i-th course card
        course_xpath = f'/html/body/div[4]/div[2]/div/div/div/div[2]/div[1]/div/div[2]/div/div/div/div/div[1]/ul/div[{i}]'
        
        # Scroll the current course card into view so the element is visible
        course_element = driver.find_element(By.XPATH, course_xpath)
        driver.execute_script("arguments[0].scrollIntoView();", course_element)
        time.sleep(random.uniform(1, 2))  # wait for the scroll to finish

        # Course name
        course_name_xpath = f'{course_xpath}/div/div/div[3]/div[1]/h3'
        course_name = course_element.find_element(By.XPATH, course_name_xpath).text
        
        # School name
        school_xpath = f'{course_xpath}/div/div/div[3]/div[1]/p'
        school_name = course_element.find_element(By.XPATH, school_xpath).text
        
        # Lead teacher
        teacher_xpath = f'{course_xpath}/div/div/div[3]/div[1]/div'
        teacher_name = course_element.find_element(By.XPATH, teacher_xpath).text
        
        # Number of participants
        participants_xpath = f'{course_xpath}/div/div/div[3]/div[2]'
        participants = course_element.find_element(By.XPATH, participants_xpath).text
        
        # Course progress
        progress_xpath = f'{course_xpath}/div/div/div[3]/div[2]/div'
        course_progress = course_element.find_element(By.XPATH, progress_xpath).text
        
        # Append the scraped record to the course_data list
        course_data.append({
            "course_name": course_name,
            "school_name": school_name,
            "teacher_name": teacher_name,
            "participants": participants,
            "course_progress": course_progress
        })
        
        # Insert the record into the database
        cursor.execute('''
        INSERT INTO courses (course_name, school_name, teacher_name, participants, course_progress)
        VALUES (?, ?, ?, ?, ?)
        ''', (course_name, school_name, teacher_name, participants, course_progress))
        conn.commit()

    except Exception as e:
        print(f"Error extracting course {i}: {e}")
        continue

# Print all the scraped course data
for data in course_data:
    print(data)

# Close the database connection
conn.close()

# Close the browser
driver.quit()
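
As a quick check that the rows were actually stored, the SQLite file can be queried afterwards; a small sketch:

import sqlite3

conn = sqlite3.connect('course_data.db')
for row in conn.execute("SELECT course_name, school_name, participants FROM courses LIMIT 5"):
    print(row)
conn.close()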

Assignment ③:
Requirements:
Get familiar with the relevant big-data services and with using Xshell.
Complete the tasks from the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx, i.e. the five tasks below; see the document for the detailed steps.
Environment setup:
Task 1: Provision the MapReduce Service (MRS)
Real-time analytics development:
Task 1: Generate test data with a Python script (a minimal sketch follows this list)
Task 2: Configure Kafka
Task 3: Install the Flume client
Task 4: Configure Flume to collect the data
Output: screenshots of the key steps or results of the experiment.
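
For Task 1 of the hands-on part, a minimal sketch of a Python script that keeps appending fake log records for Flume to pick up (the output path and the record format are illustrative assumptions; the real ones are specified in the lab manual):

import os
import random
import time
from datetime import datetime

LOG_PATH = "/tmp/flume_test/access.log"  # assumed path; use the one from the manual

def generate(n=100):
    """Append n comma-separated fake user-action records, one per second."""
    os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
    actions = ["login", "browse", "purchase", "logout"]
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        for i in range(n):
            record = "{},{},user_{:04d},{}".format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                random.choice(actions),
                random.randint(1, 9999),
                i,
            )
            f.write(record + "\n")
            f.flush()  # flush so Flume's tail/exec source sees the line immediately
            time.sleep(1)

if __name__ == "__main__":
    generate()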


posted on 2024-11-26 05:14  山间游