Python爬虫基础

爬虫：一段自动抓取互联网信息的程序，从互联网上抓取对于我们有价值的信息

　　Python 爬虫架构主要由五个部分组成，分别是调度器、URL管理器、网页下载器、网页解析器、应用程序（爬取的有价值数据）。
　　调度器：相当于一台电脑的CPU，主要负责调度URL管理器、下载器、解析器之间的协调工作。
　　URL管理器：包括待爬取的URL地址和已爬取的URL地址，防止重复抓取URL和循环抓取URL，实现URL管理器主要用三种方式，通过内存、数据库、缓存数据库来实现。
　　网页下载器：通过传入一个URL地址来下载网页，将网页转换成一个字符串。网页下载器有urllib（Python官方基础模块，Python 2 中为urllib2，Python 3 中为urllib.request，支持登录、代理和cookie）和requests（第三方包）。
　　网页解析器：将一个网页字符串进行解析，可以按照我们的要求来提取出我们有用的信息，也可以根据 DOM 树的解析方式来解析。网页解析器有正则表达式（直观，将网页转成字符串通过模糊匹配的方式来提取有价值的信息，当文档比较复杂的时候，该方法提取数据的时候就会非常的困难）、html.parser（Python自带的）、beautifulsoup（第三方插件，可以使用Python自带的html.parser进行解析，也可以使用lxml进行解析，相对于其他几种来说要强大一些）、lxml（第三方插件，可以解析 xml 和 HTML），html.parser 和 beautifulsoup 以及 lxml 都是以 DOM 树的方式进行解析的。
　　应用程序：就是从网页中提取的有用数据组成的一个应用。

先安装beautifulsoup

Beautiful Soup: Python 的第三方插件用来提取 xml 和 HTML 中的数据，官网地址 https://www.crummy.com/software/BeautifulSoup/

pip install beautifulsoup4（在cmd命令提示符中执行此代码）

1.爬虫第一个入门程序

from bs4 import BeautifulSoup
import urllib.request

# Page to download.
url = "http://www.baidu.com"

# Fetch the page. The context manager closes the HTTP response (and its
# underlying connection) as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    # Raw response body, as bytes.
    ret = response.read()
    # HTTP status code of the response.
    print(response.getcode())

print(ret)
# Parse the downloaded bytes using the standard-library html.parser backend.
soup = BeautifulSoup(ret, "html.parser", from_encoding="utf-8")
# Alternative demo: list every <a> link instead of the paragraphs.
# links = soup.find_all('a')
# for link in links:
#     print(link.name, link['href'], link.get_text())
# Print the text content of every <p> element on the page.
for paragraph in soup.find_all('p'):
    print(paragraph.get_text())

2.1爬虫程序添加data

import urllib.parse
from urllib import request

# Form fields to submit (left blank in this example).
values = {"username": "", "password": ""}
# URL-encode the fields and convert the result to bytes, which is the form
# urllib expects for a request body.
data = urllib.parse.urlencode(values).encode(encoding="UTF8")
# Target URL.
# url = "http://passport.csdn.net/login?code=applets"
url = "http://mail.qq.com/"
# Build the request; because a body is supplied, urllib sends it as a POST.
req = request.Request(url, data=data)
# Send the request and close the response once its body has been read.
with request.urlopen(req) as resp:
    print(resp.read())

2.2爬虫程序添加header

import urllib
from urllib import request

url = "http://www.zhihu.com/signin?next=%2F"
# Value sent in the User-Agent request header.
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
# Form fields to submit (left blank in this example).
values = {'username': '', 'password': ''}
data = urllib.parse.urlencode(values).encode(encoding='UTF8')
# Request headers. The HTTP header name is spelled "Referer"; a misspelled
# name would be sent as an unrelated custom header and ignored by the server.
headers = {
    'User-Agent': user_agent,
    'Referer': 'http://www.zhihu.com/signin?next=%2F',
}
# Build the request with both the form body and the custom headers.
req = request.Request(url, data=data, headers=headers)
# Send the request and close the response once its body has been read.
with request.urlopen(req) as resp:
    print(resp.read())

2.3爬虫程序添加post请求

import urllib
from urllib import request

url = "http://www.zhihu.com/signin?next=%2F"
# Value sent in the User-Agent request header.
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
# Form fields to submit (left blank in this example).
values = {'username': '', 'password': ''}
# urllib requires the request body as bytes, hence urlencode + encode.
data = urllib.parse.urlencode(values).encode(encoding='UTF8')
# Request headers. The HTTP header name is spelled "Referer"; a misspelled
# name would be sent as an unrelated custom header and ignored by the server.
headers = {
    'User-Agent': user_agent,
    'Referer': 'http://www.zhihu.com/signin?next=%2F',
}
# Supplying `data` is what turns this into a POST: urllib defaults the
# method to POST whenever a request body is present, and to GET otherwise.
req = request.Request(url, data=data, headers=headers)
# Send the request and close the response once its body has been read.
with request.urlopen(req) as resp:
    print(resp.read())

3.爬虫程序添加cookie

from http import  cookiejar
from urllib import request
# File the cookies are written to (a relative path).
filename = 'cookie.txt'
# MozillaCookieJar is a cookie jar that can be saved to / loaded from a file.
cookie = cookiejar.MozillaCookieJar(filename)
# The cookie processor stores cookies from responses into the jar and adds
# them to later requests made through the same opener.
handler = request.HTTPCookieProcessor(cookie)
# Build an opener that routes requests through the cookie handler.
opener = request.build_opener(handler)
# Make a request so that any cookies the server sets land in the jar.
response = opener.open("http://www.baidu.com")
# The body is not needed here; close the response to release the connection.
response.close()
# Write the jar to the file.
# ignore_discard=True: also save cookies that are marked to be discarded
#                      (e.g. session cookies).
# ignore_expires=True: also save cookies that have already expired.
cookie.save(ignore_discard=True, ignore_expires=True)

#利用cookie登录网站
from urllib import request
import urllib
from http import cookiejar

import os

# File the session cookies are written to (a relative path).
filename = 'cookie02.txt'
# Cookie jar that can be persisted to a file.
cookie = cookiejar.MozillaCookieJar(filename)
# Cookie processor backed by the jar.
handler = request.HTTPCookieProcessor(cookie)
# Opener that sends and stores cookies through the handler.
opener = request.build_opener(handler)
# Login form body. Credentials are read from the environment
# (JWC_USERNAME / JWC_PASSWORD) instead of being hard-coded: real account
# details must never be committed to source or published in a tutorial.
postdata = urllib.parse.urlencode({
    'username': os.environ.get('JWC_USERNAME', ''),
    'password': os.environ.get('JWC_PASSWORD', ''),
}).encode(encoding='UTF8')
# Login endpoint.
loginUrl = "http://jwc.hnshzy.cn:90/hnshjw/cas/login.action"
# Simulate the login; passing a body makes this a POST request.
result = opener.open(loginUrl, postdata)
# Only the cookies set by the login response are needed, not its body.
result.close()
# Write the jar to the file, including discardable and expired cookies.
cookie.save(ignore_discard=True, ignore_expires=True)

# Request a page through the same opener so the login cookies are sent along.
ner_url = 'http://jwc.hnshzy.cn:90/hnshjw/cas/login.action'
try:
    result = opener.open(ner_url)
except request.HTTPError as e:
    # The server answered with an HTTP error status; HTTPError always has .code.
    print(e.code)
except request.URLError as e:
    # The request never got a response (DNS failure, refused connection, ...);
    # URLError always has .reason.
    print(e.reason)
else:
    # Success path: print the body and close the response afterwards.
    with result:
        print(result.read())

4.正则表达式

from urllib import request
from bs4 import BeautifulSoup
import re
# Compile the pattern. The r prefix makes it a raw string, so backslashes
# reach the regex engine unchanged.
pattern = re.compile(r'hello')
# match() only succeeds when the pattern matches at the start of the string.
result1 = pattern.match("hello,world")
# Test the match object itself before calling .span()/.group(): on a failed
# match it is None, and calling a method on it would raise AttributeError.
if result1:
    print(result1)
    # (start, end) offsets of the match.
    print(result1.span())
    # The matched text.
    print(result1.group())

# "hello" is not at the start of this string, so match() returns None.
result2 = pattern.match("hell,hello")
if result2:
    print(result2)
else:
    print("no!no!")

# search() scans the whole string, so it finds the later "hello".
result2 = pattern.search("hell,hello")
if result2:
    print(result2)

# '.' matches any single character (except a newline by default).
pattern = re.compile(r'a.c')
resp = pattern.match("abcdefj")
print(resp)

# An escaped dot matches a literal '.'. Raw strings are used for every
# pattern so that '\.' is not treated as an (invalid) Python string escape.
pattern2 = re.compile(r'a\.c')
resp2 = pattern2.match("a.cd")
print(resp2)

# Matching a literal backslash: the regex needs '\\', which in a raw string
# is written exactly as two backslashes. The subject string is raw as well,
# so it contains a single real backslash.
pattern3 = re.compile(r'a\\c')
resp3 = pattern3.match(r"a\cd")
print(resp3)

# Character class: one letter in a-f or A-F. Ranges inside [...] are simply
# listed side by side; a comma between them would itself become a matchable
# character.
pattern4 = re.compile(r'a[a-fA-F]c')
resp4 = pattern4.match("afcdefg")
print(resp4)


# '*': zero or more digits.
re01 = re.compile(r'\d*')
res01 = re01.match('123456xxxxxxxxx')
print(res01)


# '+': one or more digits, followed by one word character.
re02 = re.compile(r'\d+\w')
res02 = re02.match('123456xxxxxxxxx')
print(res02)


# '?': an optional digit, followed by one word character.
re03 = re.compile(r'\d?\w')
res03 = re03.match('4xxxxxxxxx')
print(res03)


# '{n}': the digit 1 followed by exactly five more digits.
re04 = re.compile(r'1\d{5}')
res04 = re04.match('123456xxxxxxxxx')
print(res04)

# '{m,n}': 5 to 11 digits, '@', two word characters, a literal dot and
# three word characters.
re05 = re.compile(r'\d{5,11}@\w{2}\.\w{3}')
res05 = re05.match('123456789@qq.com')
print(res05)


# Subject string shared by all greedy / lazy demonstrations below.
sample = "dhakdhadlkadajdlkadjadjalkdja45343"

# Greedy (default): '+' takes as many word characters as possible.
re06 = re.compile(r'\w+')
res06 = re06.match(sample)
print(res06)
# Lazy: a trailing '?' makes '+' take as few characters as possible.
re06 = re.compile(r'\w+?')
res06 = re06.match(sample)
print(res06)

# Greedy bounded repeat: takes up to the upper bound of 10.
re06 = re.compile(r'\w{5,10}')
res06 = re06.match(sample)
print(res06)

# Lazy bounded repeat: stops at the lower bound of 5.
re06 = re.compile(r'\w{5,10}?')
res06 = re06.match(sample)
print(res06)

# Lazy '*': zero repetitions already satisfy it, so the match is empty.
re06 = re.compile(r'\w*?')
res06 = re06.match(sample)
print(res06)

# '^' anchors the pattern to the start of the string.
re07 = re.compile(r'^5678')
res07 = re07.match('56789')
print(res07)

# '$' anchors the pattern to the end of the string.
re08 = re.compile(r'789$')
res08 = re08.search('56789')
print(res08)

# '\A' matches only at the very start of the string.
re09 = re.compile(r'\A\w{1,6}')
res09 = re09.search('dihakldjal12345678')
print(res09)

# '\Z' matches only at the very end of the string.
re10 = re.compile(r'dja\Z')
res10 = re10.search('dihakldja')
print(res10)

# '\b' is a word boundary: here, the position between 'a' and '!'.
re11 = re.compile(r'a\b!bc')
res11 = re11.search('a!bc')
print(res11)


# '|' is alternation: match either 'abc' or 'efg' (the leftmost hit wins).
re12 = re.compile(r'abc|efg')
res12 = re12.search('abcdjklefg')
print(res12)

# A parenthesised group can be repeated as a unit.
re13 = re.compile(r'(abc){2}')
res13 = re13.search('abcabchjkk')
print(res13)
# Several groups in sequence, each captured separately.
re13 = re.compile(r'(abc)(def)')
res13 = re13.search('abcdefhjkk')
print(res13)

# '(?P<name>...)' gives the group a name.
re14 = re.compile(r'(?P<name>abc)')
res14 = re14.search('abcdefg')
print(res14)

# '\1' is a backreference: it must repeat what group 1 captured.
re15 = re.compile(r'(\d)abc\1')
res15 = re15.search('5abc5')
print(res15)

# '(?P=name)' is a backreference to the named group.
re16 = re.compile(r'(?P<name>abc)efg(?P=name)')
res16 = re16.search('abcefgabc')
print(res16)

posted @ 2022-03-12 10:32 小酒馆里的清茶阅读(47) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· Python简单爬取网页

· Python爬虫-xpath

· Python爬虫基础 _曾佳豪

· python爬虫简介

· python爬虫

公告

昵称：小酒馆里的清茶
园龄： 3年
粉丝： 4
关注： 4

+加关注

2025年3月

日

一

二

三

四

五

六

Python爬虫基础

公告

搜索

常用链接

随笔分类

随笔档案

阅读排行榜