Data Collection and Fusion Technology Practice: Assignment 4

102202143 梁锦盛

1. Scraping Stock Information from Eastmoney (东方财富网)

Using the Selenium framework with MySQL for storage, scrape stock data from three boards on Eastmoney: 沪深A股 (all A-shares), 上证A股 (Shanghai A-shares), and 深证A股 (Shenzhen A-shares).

I. Assignment Code and Demonstration

1. Code File

from selenium import webdriver
from selenium.webdriver.common.by import By
import mysql.connector
import time

# Set up the MySQL connection
connection = mysql.connector.connect(
    host="localhost",
    user="root",
    password="2896685056Qq!",
    database="stock_data"
)
cursor = connection.cursor()

# Set up the Selenium driver
driver = webdriver.Chrome()

# URLs for each board
board_urls = {
    "沪深A股": "http://quote.eastmoney.com/center/gridlist.html#hs_a_board",
    "上证A股": "http://quote.eastmoney.com/center/gridlist.html#sh_a_board",
    "深证A股": "http://quote.eastmoney.com/center/gridlist.html#sz_a_board"
}

# Visit each board page and extract its data
for board_name, url in board_urls.items():
    driver.get(url)
    time.sleep(3)  # wait for the page to load

    # Grab every row of the stock table
    rows = driver.find_elements(By.XPATH, "//*[@id='table_wrapper-table']/tbody/tr")

    for row in rows:
        data = {
            "bStockNo": row.find_element(By.XPATH, "./td[2]/a").text,  # 股票代码
            "bStockName": row.find_element(By.XPATH, "./td[3]/a").text,  # 股票名称
            "latestPrice": row.find_element(By.XPATH, "./td[5]").text,  # 最新报价
            "changeRate": row.find_element(By.XPATH, "./td[6]").text,  # 涨跌幅
            "changeAmount": row.find_element(By.XPATH, "./td[7]").text,  # 涨跌额
            "volume": row.find_element(By.XPATH, "./td[8]").text,  # 成交量
            "turnover": row.find_element(By.XPATH, "./td[9]").text,  # 成交额
            "amplitude": row.find_element(By.XPATH, "./td[10]").text,  # 振幅
            "highPrice": row.find_element(By.XPATH, "./td[11]/span").text,  # 最高价
            "lowPrice": row.find_element(By.XPATH, "./td[12]").text,  # 最低价
            "openPrice": row.find_element(By.XPATH, "./td[13]").text,  # 今开
            "prevClose": row.find_element(By.XPATH, "./td[14]").text  # 昨收
        }

        # Strip the percent sign and convert to a decimal fraction
        data["changeRate"] = float(data["changeRate"].replace('%', '')) / 100 if '%' in data["changeRate"] else float(data["changeRate"])
        data["amplitude"] = float(data["amplitude"].replace('%', '')) / 100 if '%' in data["amplitude"] else float(data["amplitude"])
        print(data)

        # Insert the record into MySQL
        cursor.execute("""
            INSERT INTO stocks (bStockNo, bStockName, latestPrice, changeRate, changeAmount, volume, turnover, amplitude, highPrice, lowPrice, openPrice, prevClose, board)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (data["bStockNo"], data["bStockName"], data["latestPrice"], data["changeRate"], data["changeAmount"],
              data["volume"], data["turnover"], data["amplitude"], data["highPrice"], data["lowPrice"],
              data["openPrice"], data["prevClose"], board_name))

# Commit the transaction
connection.commit()

# Close the connections
cursor.close()
connection.close()
driver.quit()
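Note that the INSERT statement assumes a stocks table already exists in the stock_data database. A minimal one-time setup sketch is below; the column types and widths are my assumptions rather than part of the assignment:

import mysql.connector

# One-time setup: create the table the scraper writes into.
# Column types/widths are assumptions; the price fields arrive as
# page text, so they are kept as VARCHAR here.
connection = mysql.connector.connect(host="localhost", user="root",
                                     password="2896685056Qq!",
                                     database="stock_data")
cursor = connection.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS stocks (
        id INT AUTO_INCREMENT PRIMARY KEY,
        bStockNo VARCHAR(16),
        bStockName VARCHAR(64),
        latestPrice VARCHAR(16),
        changeRate DECIMAL(10, 6),
        changeAmount VARCHAR(16),
        volume VARCHAR(32),
        turnover VARCHAR(32),
        amplitude DECIMAL(10, 6),
        highPrice VARCHAR(16),
        lowPrice VARCHAR(16),
        openPrice VARCHAR(16),
        prevClose VARCHAR(16),
        board VARCHAR(16)
    )
""")
connection.commit()
cursor.close()
connection.close()

Storing changeRate and amplitude as DECIMAL matches the percent-to-decimal conversion done in the scraper.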



2. Run Results


3. Gitee folder link: https://gitee.com/liang-jinsheng-289668/project/tree/master/作业4

II. Reflections

Through this assignment, I learned how to use Selenium to scrape data from different pages.

2. Scraping Course Information from China University MOOC (icourse163)

Use the Selenium framework with MySQL to scrape course information from the China University MOOC site (course number, course name, school, lead teacher, team members, participant count, course schedule, and course description).

I. Assignment Code and Demonstration

1. Code

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
import time

# Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--disable-extensions')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=options)

# MySQL connection settings
db = pymysql.connect(
    host='localhost',
    user='root',
    password='2896685056Qq!',
    database='MoocData',
    charset='utf8mb4'
)
cursor = db.cursor()

# Helper: insert one course record into MySQL
def insert_data(course_info):
    sql = """INSERT INTO Courses (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
             VALUES (%s, %s, %s, %s, %s, %s, %s)"""
    cursor.execute(sql, course_info)
    db.commit()
    print(f"Data inserted: {course_info}")

# Log in to icourse163 with phone number and password
def login():
    driver.get("https://www.icourse163.org")
    print("Accessing login page")

    # Click the login button
    login_btn = driver.find_element(By.CLASS_NAME, "_3uWA6")
    login_btn.click()

    # Switch into the login iframe
    iframe = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.TAG_NAME, "iframe"))
    )
    driver.switch_to.frame(iframe)

    # Enter the username and password
    driver.find_element(By.ID, "phoneipt").send_keys("13015721181")
    driver.find_element(By.XPATH,"//input[@type='password' and @name='email' and contains(@class, 'j-inputtext')]").send_keys("2896685056Qq!")  # 输入密码
    driver.find_element(By.ID, "submitBtn").click()

    # Switch back to the main document
    driver.switch_to.default_content()
    print("Login successful")
    time.sleep(5)

# Main scraping logic
def scrape_courses():
    driver.get("https://www.icourse163.org")
    print("Accessing course list page")

    # Scroll down until the page height stops growing, so that all
    # lazily loaded course cards (class _3KiL7) are present.
    # (The original nested while-loops re-scraped the same cards on every
    # pass and never terminated once the height stabilized; scrolling
    # first and then processing each card once fixes both problems.)
    scroll_pause_time = 3  # seconds to wait after each scroll
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)

        # Compare the new page height with the previous one
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Wait for the course cards, then process each one in turn
    courses = WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "_3KiL7"))
    )

    for course in courses:
        try:
            # Basic information from the course card
            cCourse = course.find_element(By.CLASS_NAME, "_3EwZv").text
            print(cCourse)
            cCollege = course.find_element(By.CLASS_NAME, "_2lZi3").text
            print(cCollege)
            cCount = int(course.find_element(By.CLASS_NAME, "_3DcLu").text.replace("人参加", ""))
            print(cCount)

            # Open the course detail page (a new tab) and switch to it
            course.find_element(By.CLASS_NAME, "_3EwZv").click()
            driver.switch_to.window(driver.window_handles[-1])

            # Wait for the detail page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "f-richEditorText"))
            )

            # Detail information
            cBrief = driver.find_element(By.CLASS_NAME, "f-richEditorText").text
            print(cBrief)
            cProcess = driver.find_element(By.CLASS_NAME, "text").text
            print(cProcess)
            cTeacher = driver.find_element(By.CLASS_NAME, "f-fc3").text
            print(cTeacher)
            # "cnt h3" contains a space, which By.CLASS_NAME rejects;
            # use a CSS selector for <h3> elements inside .cnt containers
            cTeam = ", ".join(
                t.text for t in driver.find_elements(By.CSS_SELECTOR, ".cnt h3")
            )

            # Insert the record
            course_info = (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
            insert_data(course_info)

            # Close the detail tab and return to the list page
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

        except Exception as e:
            print(f"Error processing course: {e}")
            # Make sure we are back on the list page before continuing
            if len(driver.window_handles) > 1:
                driver.close()
            driver.switch_to.window(driver.window_handles[0])

# Run: log in first, then scrape
try:
    login()
    scrape_courses()
finally:
    driver.quit()
    db.close()
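Likewise, insert_data assumes a Courses table in the MoocData database. A minimal setup sketch, with assumed column types and widths:

import pymysql

# One-time setup for the Courses table; types and widths are assumptions
db = pymysql.connect(host='localhost', user='root',
                     password='2896685056Qq!', charset='utf8mb4')
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS MoocData CHARACTER SET utf8mb4")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS MoocData.Courses (
        id INT AUTO_INCREMENT PRIMARY KEY,
        cCourse VARCHAR(128),
        cCollege VARCHAR(128),
        cTeacher VARCHAR(64),
        cTeam VARCHAR(255),
        cCount INT,
        cProcess VARCHAR(64),
        cBrief TEXT
    )
""")
db.commit()
db.close()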



2. Run Results


3. Gitee folder link: https://gitee.com/liang-jinsheng-289668/project/tree/master/作业4

II. Reflections

I learned how to use Selenium to click each course card, move into the detail page and back again, and scrape different pieces of information by element class name.

3. Flume Log Collection

Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx (Huawei Cloud big-data real-time analysis lab manual, Flume log collection experiment, partial).

I. Assignment Demonstration

Task 1: Generate test data with a Python script
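The manual's own generator script is not reproduced here, so the sketch below is a hypothetical stand-in: it appends one timestamped, comma-separated record per second to a log file for Flume to pick up. The output path and record format are my assumptions, not the manual's:

import random
import time

# Hypothetical test-data generator; stop it with Ctrl+C.
# The path below is an assumed location watched by the Flume source.
LOG_FILE = "/tmp/flume_test.log"

while True:
    record = "{},user{},click,{}".format(
        time.strftime("%Y-%m-%d %H:%M:%S"),
        random.randint(1, 100),
        random.choice(["index", "detail", "cart", "pay"])
    )
    with open(LOG_FILE, "a") as f:
        f.write(record + "\n")
    time.sleep(1)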

Task 2: Configure Kafka

Install the Kafka runtime environment

Install the Kafka client

Set the environment variables and create a topic in Kafka
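For reference, a topic is typically created with the Kafka client's shell tool. The hosts and topic name below are placeholders rather than the manual's values, and the exact flag depends on the Kafka version shipped with the client:

# Placeholder hosts and topic name; run from the Kafka client's bin directory.
# Older Kafka clients register topics through ZooKeeper:
kafka-topics.sh --create --zookeeper <zk-host>:2181/kafka --partitions 1 --replication-factor 1 --topic <topic-name>
# Newer clients use the broker address instead:
# kafka-topics.sh --create --bootstrap-server <broker-host>:9092 --partitions 1 --replication-factor 1 --topic <topic-name>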

Task 3: Install the Flume client

Install the Flume runtime environment

Install Flume into the directory "/opt/FlumeClient"

Task 4: Configure Flume to collect data
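The manual's actual agent configuration is not reproduced here. As a generic sketch under assumed names, a Flume agent that tails the generated log file and forwards each line to a Kafka topic could look like the following; the agent name, file path, topic, and broker address are all placeholders:

# Hypothetical Flume agent: taildir source -> memory channel -> Kafka sink.
# All names, paths, and addresses below are placeholder assumptions.
a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /tmp/flume_test.log
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000

a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = <topic-name>
a1.sinks.k1.kafka.bootstrap.servers = <broker-host>:9092
a1.sinks.k1.channel = c1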

II. Reflections

Through this assignment I learned about the characteristics and advantages of the Kunpeng architecture and how to create and manage server instances on Huawei Cloud. I practiced deploying Hadoop on a Kunpeng server, including installing, configuring, and starting HDFS, tuning configuration parameters, and planning resources, and I learned to use cloud platform services such as cloud disks and elastic IPs. I also came to appreciate the importance of real-time analysis, its pipeline and architecture, and how it differs from offline analysis, so the right approach must be chosen for each business scenario. The assignment strengthened my data-analysis mindset, showed me how offline analysis is applied in industry, and reminded me to keep learning new technologies to improve my skills, patience, and perseverance.
