Learning Web Scraping from Scratch, Part 8: Scraping Wikipedia Entry Links with requests, pymysql, and beautifulsoup4 and Storing Them in a Database

Reference documentation:

https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

# Install beautifulsoup4

(pytools) D:\python\pytools>pip install beautifulsoup4
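After installing, a quick sanity check (a minimal sketch, run in the same virtualenv) confirms that the bs4 package imports and can parse a snippet of HTML:

# quick check that beautifulsoup4 is importable and can parse HTML
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello <a href='/wiki/Test'>Test</a></p>", "html.parser")
print(soup.a["href"])   # expected output: /wiki/Test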

# Install the MySQL module (pymysql)

pymysql repository: https://github.com/PyMySQL/PyMySQL
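To install pymysql and verify that it can reach the MySQL server, a minimal sketch is shown below; the host/user/password values mirror the localhost/root/root settings used later in this post and may need adjusting for your environment:

(pytools) D:\python\pytools>pip install pymysql

# minimal connectivity check; assumes a local MySQL server with user root / password root
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root', charset='utf8mb4')
print(conn.get_server_info())   # prints the MySQL server version string
conn.close()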

 

# Scrape Wikipedia entries

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re


def spider_wike():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # decode the response body as UTF-8
    resp.encoding = 'utf-8'

    html_doc = resp.text

    soup = BeautifulSoup(html_doc, "html.parser")
    # find all <a> tags whose href starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)

    # print the name and URL of every entry
    for url in list_urls:
        # skip links that point to .jpg / .JPG images
        if not re.search(r"\.(jpg|JPG)", url["href"]):
            # entry name plus full URL
            # .string only returns a single child string; get_text() returns all text inside the tag
            print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])


if __name__ == '__main__':
    spider_wike()
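The comment above mentions the difference between .string and get_text(); a small illustration (hypothetical HTML, just to show the behaviour) follows:

# illustration of .string vs get_text() on a tag with nested children
from bs4 import BeautifulSoup

tag = BeautifulSoup("<a href='/wiki/Python'>Python <b>(language)</b></a>", "html.parser").a
print(tag.string)      # None, because the <a> tag has more than one child
print(tag.get_text())  # "Python (language)" - concatenates all text in the subtree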

# Store the Wikipedia entry links in the database

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re
import pymysql.cursors


''' 
    # environment setup
    pip install pymysql
    create database wikiurl charset=utf8mb4;
    use wikiurl;
    create table urls (id int primary key auto_increment,urlname varchar(255),urlhref varchar(1000));
'''
url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
# decode the response body as UTF-8
resp.encoding = 'utf-8'

html_doc = resp.text

soup = BeautifulSoup(html_doc, "html.parser")
# find all <a> tags whose href starts with /wiki/
list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(list_urls)

# print the name and URL of every entry, then insert it into the database
for url in list_urls:
    # skip links that point to .jpg / .JPG images
    if not re.search(r"\.(jpg|JPG)", url["href"]):
        # entry name plus full URL
        # .string only returns a single child string; get_text() returns all text inside the tag
        print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])

        # note: this opens and closes a new connection for every inserted row
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            # get a cursor for this connection
            with connection.cursor() as cursor:
                # build the SQL statement
                sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"

                # execute the SQL statement with the entry name and URL as parameters
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                # commit the row
                connection.commit()
        finally:
            connection.close()
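Opening a new connection per row works but is slow. A possible variant (a sketch under the same table schema and connection settings) collects the rows first and inserts them over a single connection with executemany:

# sketch: insert all scraped rows over one connection instead of one connection per row
rows = [(a.get_text(), "https://en.wikipedia.org" + a["href"])
        for a in list_urls
        if not re.search(r"\.(jpg|JPG)", a["href"])]

connection = pymysql.connect(host='localhost', user='root', password='root',
                             db='wikiurl', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
        cursor.executemany(sql, rows)   # batched insert instead of per-row execute()
    connection.commit()
finally:
    connection.close()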

 

# Read the entry data back from the database

# coding=utf-8

import pymysql


def get_conn():
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    return connection


def get_wiki_data():
    conn = get_conn()

    sql = "select `urlname`,`urlhref` from urls"
    cur = conn.cursor()
    # execute() returns the number of matched rows
    count = cur.execute(sql)
    print(count)

    # fetch all rows
    # urllists = cur.fetchall()
    # fetch a given number of rows
    # urllists = cur.fetchmany(3)
    #
    # for url in urllists:
    #     print(url[0],'<--->',url[1])

    # fetch a single row
    link = cur.fetchone()
    print(link)

    # close the database connection
    conn.close()


def get_data():
    conn = get_conn()

    try:
        with conn.cursor() as cur:
            sql = "select `urlname`,`urlhref` from urls where `id` is not NULL"
            count = cur.execute(sql)
            print(count)

            # fetch all rows
            # data = cur.fetchall()
            # print(data)

            # fetch a fixed number of rows
            result = cur.fetchmany(size=5)
            print(result)
    finally:
        conn.close()


if __name__ == '__main__':
    # get_wiki_data()
    get_data()
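pymysql can also return rows as dictionaries keyed by column name, which avoids indexing by position. A minimal sketch against the same wikiurl database, using pymysql.cursors.DictCursor:

# sketch: read rows as dictionaries with pymysql.cursors.DictCursor
import pymysql
import pymysql.cursors

conn = pymysql.connect(host='localhost', user='root', password='root',
                       db='wikiurl', charset='utf8mb4',
                       cursorclass=pymysql.cursors.DictCursor)
try:
    with conn.cursor() as cur:
        cur.execute("select `urlname`,`urlhref` from urls limit 5")
        for row in cur.fetchall():
            print(row['urlname'], '<--->', row['urlhref'])
finally:
    conn.close()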

 
