数据采集与融合技术实践作业四
Gitee仓库:https://gitee.com/wang-zi-lian20031002/crawl_project
一、熟练掌握 Selenium 查找 HTML 元素、爬取 Ajax 网页数据、等待 HTML 元素等内容。使用 Selenium 框架 + MySQL 数据库存储技术路线爬取“沪深 A 股”、“上证 A 股”、“深证 A 股”3 个板块的股票数据信息。
1、核心代码与效果图展示
- 核心代码
def __init__(self):
    """Configure a file-based logger shared by the spider instance."""
    logging.basicConfig(
        filename='spider.log',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )
    # Module-scoped logger; all spider methods log through this.
    self.logger = logging.getLogger(__name__)
def startUp(self, url, key):
    """Launch a headless Chrome session at *url* and reset board state."""
    opts = Options()
    # Headless: scrape without opening a visible browser window.
    opts.add_argument('--headless')
    opts.add_argument('--disable-gpu')
    self.driver = webdriver.Chrome(options=opts)
    self.driver.get(url)
    # Tab ids of the three boards: HuShen A, ShangZheng A, ShenZheng A.
    self.bankuai = ["nav_hs_a_board", "nav_sh_a_board", "nav_sz_a_board"]
    self.bankuai_id = 0  # index of the board currently being crawled
def processSpider(self, max_page=10):
    """Scrape stock rows from the loaded Eastmoney table, board by board.

    Args:
        max_page: maximum number of pages/boards to process before stopping.

    Side effects: inserts parsed rows via ``self.insertDB`` and logs progress
    through ``self.logger``.
    """
    page = 1
    processed_codes = set()  # stock codes already stored, used to skip duplicates
    while True:
        time.sleep(1)  # give the Ajax-rendered table time to appear
        self.logger.info(f"Processing page {page} of data...")
        rows = self.driver.find_elements(By.XPATH, "//table[@class='table_wrapper-table']/tbody/tr")
        if not rows:
            self.logger.warning('No data found on current page.')
            break
        data = []
        for row in rows:
            try:
                # Fetch all cell texts once instead of re-querying the row
                # thirteen times with find_elements.
                cells = [td.text for td in row.find_elements(By.XPATH, "./td")]
                code = cells[1]
                if code in processed_codes:
                    continue  # skip stock codes already collected
                # Column order (per the page layout): serial number, code,
                # name, latest price, change %, change amount, volume,
                # turnover, amplitude, high, low, today's open,
                # yesterday's close.  (Avoid shadowing builtins max/min.)
                data.append((cells[0], code, cells[2], cells[4], cells[5],
                             cells[6], cells[7], cells[8], cells[9],
                             cells[10], cells[11], cells[12], cells[13]))
                processed_codes.add(code)
            except Exception as err:
                self.logger.error(f"Data parsing error: {err}")
        try:
            if data:
                self.insertDB(data)
                self.logger.info('Data inserted successfully.')
            else:
                self.logger.warning('No new data to be inserted.')
        except Exception as err:
            self.logger.error(f"Insert error: {err}")
        if page >= max_page:
            break
        try:
            # Rotate to the next board tab (hs -> sh -> sz -> hs ...);
            # renamed from `next` to avoid shadowing the builtin.
            self.bankuai_id += 1
            next_tab = self.driver.find_element(
                By.XPATH, "//li[@id='" + self.bankuai[self.bankuai_id % 3] + "']/a")
            self.driver.execute_script("arguments[0].click();", next_tab)
            self.logger.info('Switched to next page successfully.')
            time.sleep(5)  # wait for the new board's table to load
        except Exception as err:
            self.logger.error(f"Switch page error: {err}, exit spider.")
            break
        page += 1
- 效果图展示
2、心得体会
Selenium是一个非常强大的工具,它允许我们通过模拟用户操作来抓取动态网页数据。在这个项目中,我学习了如何设置Chrome的无头模式,这样可以在不打开浏览器窗口的情况下进行网页抓取,大大节省了资源。此外,我还学会了如何通过XPath定位网页元素,提取所需的数据。
二、熟练掌握 Selenium 查找 HTML 元素、实现用户模拟登录、爬取 Ajax 网页数据、 等待 HTML 元素等内容。使用 Selenium 框架+MySQL 爬取中国 mooc 网课程资源信息(课程号、课程名 称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
1、核心代码与效果图展示
- 核心代码
options = webdriver.ChromeOptions()
# Disable image loading (value 2 = block) to speed up page rendering.
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
browser = webdriver.Chrome(options=options)
# Search-results page for the URL-encoded keyword "大数据" (big data) on icourse163.
url = "https://www.icourse163.org/search.htm?search=%E5%A4%A7%E6%95%B0%E6%8D%AE#/"
# Declare a list intended to hold per-course dicts.
# NOTE(review): data_list is never written to in the visible code — confirm it is used elsewhere.
data_list = []
def create_table_if_not_exists(conn):
    """Create the `mooc` table on *conn* if it does not already exist.

    Args:
        conn: an open pymysql connection; the DDL is committed on it.
    """
    cursor = conn.cursor()
    try:
        cursor.execute("""
    CREATE TABLE IF NOT EXISTS `mooc` (
        `id` INT AUTO_INCREMENT PRIMARY KEY,
        `course` VARCHAR(255) NOT NULL,
        `college` VARCHAR(255) NOT NULL,
        `teacher` VARCHAR(255) NOT NULL,
        `team` VARCHAR(255) DEFAULT 'none',
        `count` VARCHAR(50) NOT NULL,
        `process` VARCHAR(50) NOT NULL,
        `brief` TEXT
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    """)
        conn.commit()
    finally:
        # Release the cursor even if the DDL fails (was previously leaked).
        cursor.close()
def start_spider():
    """Crawl MOOC course cards from the search page and store them in MySQL.

    Uses the module-level ``browser`` and ``url``.  Opens ONE database
    connection for the whole run (the original reconnected, re-created the
    table, committed, and closed the connection once per course row).
    """
    browser.get(url)
    # Explicit wait until the Ajax-loaded course-card container is present.
    WebDriverWait(browser, 1000).until(
        EC.presence_of_all_elements_located(
            (By.ID, "j-courseCardListBox")
        )
    )
    # Scroll to the bottom so lazy-loaded cards render, then back to the top.
    browser.execute_script('document.documentElement.scrollTop=10000')
    time.sleep(random.randint(3, 6))  # random delay to let the elements refresh
    browser.execute_script('document.documentElement.scrollTop=0')
    # One connection and one table-existence check for the whole batch.
    conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="123456", db="Data_acquisition", charset="utf8")
    try:
        create_table_if_not_exists(conn)
        cursor = conn.cursor()
        count = 0
        for link in browser.find_elements(By.XPATH, '//div[@class="u-clist f-bgw f-cb f-pr j-href ga-click"]'):
            count += 1
            # Course name (note: the class attribute really starts with a space).
            course_name = link.find_element(By.XPATH, './/span[@class=" u-course-name f-thide"]').text
            school_name = link.find_element(By.XPATH, './/a[@class="t21 f-fc9"]').text
            # Lead teacher
            m_teacher = link.find_element(By.XPATH, './/a[@class="f-fc9"]').text
            # Team members — some courses list none, so fall back to 'none'.
            try:
                team_member = link.find_element(By.XPATH, './/span[@class="f-fc9"]').text
            except Exception:
                team_member = 'none'
            # Enrolment count; strip the trailing "参加" ("joined") suffix.
            join = link.find_element(By.XPATH, './/span[@class="hot"]').text
            join = join.replace('参加', '').strip()
            # Course progress
            process = link.find_element(By.XPATH, './/span[@class="txt"]').text
            # Course brief/introduction
            introduction = link.find_element(By.XPATH, './/span[@class="p5 brief f-ib f-f0 f-cb"]').text
            try:
                cursor.execute(
                    "INSERT INTO mooc (`id`,`course`,`college`,`teacher`,`team`,`count`,`process`,`brief`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                    (str(count), course_name, school_name, m_teacher, team_member, join, process, introduction))
            except Exception as err:
                print("error is ")
                print(err)
        # Single commit for the whole batch instead of one per row.
        conn.commit()
    finally:
        conn.close()
- 效果图展示
2、心得体会
掌握了Selenium的基本使用,了解了如何处理动态加载的网页内容。同时,加深了对MySQL数据库操作的理解,学会了如何通过Python脚本创建和管理数据库表。