Learning Web Scraping from Scratch, Part 8: Scraping Wikipedia Entry Links with requests, pymysql, and beautifulsoup4 and Storing Them in a Database

Reference documentation:

https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

# Install beautifulsoup4

(pytools) D:\python\pytools>pip install beautifulsoup4
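After installing, a quick sanity check (a minimal sketch, run in the same virtualenv) confirms that the bs4 package imports and can parse a snippet of HTML:

# quick check that beautifulsoup4 is importable and can parse HTML
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello <a href='/wiki/Test'>Test</a></p>", "html.parser")
print(soup.a["href"])   # expected output: /wiki/Test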

# Install the MySQL module (pymysql)

pymysql repository: https://github.com/PyMySQL/PyMySQL
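To install pymysql and verify that it can reach the MySQL server, a minimal sketch is shown below; the host/user/password values mirror the localhost/root/root settings used later in this post and may need adjusting for your environment:

(pytools) D:\python\pytools>pip install pymysql

# minimal connectivity check; assumes a local MySQL server with user root / password root
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root', charset='utf8mb4')
print(conn.get_server_info())   # prints the MySQL server version string
conn.close()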

 

# Scrape Wikipedia entries

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re


def spider_wike():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # decode the response body as UTF-8
    resp.encoding = 'utf-8'

    html_doc = resp.text

    soup = BeautifulSoup(html_doc, "html.parser")
    # find all <a> tags whose href starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)

    # print the name and URL of every entry
    for url in list_urls:
        # skip links that point to .jpg / .JPG images
        if not re.search(r"\.(jpg|JPG)", url["href"]):
            # entry name plus full URL
            # .string only returns a single child string; get_text() returns all text inside the tag
            print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])


if __name__ == '__main__':
    spider_wike()
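The comment above mentions the difference between .string and get_text(); a small illustration (hypothetical HTML, just to show the behaviour) follows:

# illustration of .string vs get_text() on a tag with nested children
from bs4 import BeautifulSoup

tag = BeautifulSoup("<a href='/wiki/Python'>Python <b>(language)</b></a>", "html.parser").a
print(tag.string)      # None, because the <a> tag has more than one child
print(tag.get_text())  # "Python (language)" - concatenates all text in the subtree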

# Store the Wikipedia entry links in the database

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import re
import pymysql.cursors


''' 
    # environment setup
    pip install pymysql
    create database wikiurl charset=utf8mb4;
    use wikiurl;
    create table urls (id int primary key auto_increment,urlname varchar(255),urlhref varchar(1000));
'''
url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
# decode the response body as UTF-8
resp.encoding = 'utf-8'

html_doc = resp.text

soup = BeautifulSoup(html_doc, "html.parser")
# find all <a> tags whose href starts with /wiki/
list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(list_urls)

# print the name and URL of every entry, then insert it into the database
for url in list_urls:
    # skip links that point to .jpg / .JPG images
    if not re.search(r"\.(jpg|JPG)", url["href"]):
        # entry name plus full URL
        # .string only returns a single child string; get_text() returns all text inside the tag
        print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])

        # note: this opens and closes a new connection for every inserted row
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            # get a cursor for this connection
            with connection.cursor() as cursor:
                # build the SQL statement
                sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"

                # execute the SQL statement with the entry name and URL as parameters
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                # commit the row
                connection.commit()
        finally:
            connection.close()
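Opening a new connection per row works but is slow. A possible variant (a sketch under the same table schema and connection settings) collects the rows first and inserts them over a single connection with executemany:

# sketch: insert all scraped rows over one connection instead of one connection per row
rows = [(a.get_text(), "https://en.wikipedia.org" + a["href"])
        for a in list_urls
        if not re.search(r"\.(jpg|JPG)", a["href"])]

connection = pymysql.connect(host='localhost', user='root', password='root',
                             db='wikiurl', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
        cursor.executemany(sql, rows)   # batched insert instead of per-row execute()
    connection.commit()
finally:
    connection.close()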

 

# Read the entry data back from the database

# coding=utf-8

import pymysql


def get_conn():
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    return connection


def get_wiki_data():
    conn = get_conn()

    sql = "select `urlname`,`urlhref` from urls"
    cur = conn.cursor()
    # execute() returns the number of matched rows
    count = cur.execute(sql)
    print(count)

    # fetch all rows
    # urllists = cur.fetchall()
    # fetch a given number of rows
    # urllists = cur.fetchmany(3)
    #
    # for url in urllists:
    #     print(url[0],'<--->',url[1])

    # fetch a single row
    link = cur.fetchone()
    print(link)

    # close the database connection
    conn.close()


def get_data():
    conn = get_conn()

    try:
        with conn.cursor() as cur:
            sql = "select `urlname`,`urlhref` from urls where `id` is not NULL"
            count = cur.execute(sql)
            print(count)

            # fetch all rows
            # data = cur.fetchall()
            # print(data)

            # fetch a fixed number of rows
            result = cur.fetchmany(size=5)
            print(result)
    finally:
        conn.close()


if __name__ == '__main__':
    # get_wiki_data()
    get_data()
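pymysql can also return rows as dictionaries keyed by column name, which avoids indexing by position. A minimal sketch against the same wikiurl database, using pymysql.cursors.DictCursor:

# sketch: read rows as dictionaries with pymysql.cursors.DictCursor
import pymysql
import pymysql.cursors

conn = pymysql.connect(host='localhost', user='root', password='root',
                       db='wikiurl', charset='utf8mb4',
                       cursorclass=pymysql.cursors.DictCursor)
try:
    with conn.cursor() as cur:
        cur.execute("select `urlname`,`urlhref` from urls limit 5")
        for row in cur.fetchall():
            print(row['urlname'], '<--->', row['urlhref'])
finally:
    conn.close()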

 
