Python Web Scraping: Beginner's Notes

Note sources

Day-0

1. If you are not yet familiar with basic Python syntax, head over to |>>>Python 基础 小白入门笔记<<<| or study it on your own first.

Introduction

1. What is a web crawler?
A web crawler (also known as a web spider or web robot, and in the FOAF community more often called a web chaser) is a program or script that automatically fetches information from the World Wide Web according to a set of rules. Other, less common names include ant, auto-indexer, emulator, and worm.
2. The essence
Simulate a browser sending requests to a server, then split, filter, and save the data that comes back.
3. General steps for writing a crawler
Prepare -> Fetch the data -> Parse the content -> Save the data
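
These four steps map onto every example in these notes. A minimal skeleton, just to show the shape (the function names and the output path are illustrative, not a fixed API):

import urllib.request          # fetch
from bs4 import BeautifulSoup  # parse

def fetch(url):
    # pose as a browser so the server answers normally
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    return urllib.request.urlopen(req).read().decode("utf-8")

def parse(html):
    # keep only the pieces we care about, here just the page title
    soup = BeautifulSoup(html, "html.parser")
    return [soup.title.string]

def save(rows, path):
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(str(row) + "\n" for row in rows)

# prepare -> fetch -> parse -> save
save(parse(fetch("http://www.baidu.com")), "result.txt")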

2. A crawler example

graph LR
    U[Scrape Douban Top 250 movie info]

graph LR
    A[main function]
    A --> B[1. fetch-page function]
    A --> D[2. parse-data function]
    A --> E[3. save-data function]
    B --> f(def askURL)
    D --> g(def getdata)
    E --> h(def savedata)
    f -->|parameter| i(target url)
    g -->|parameter| j(base url: baseurl)
    h -->|parameter| k(datalist, savepath)
    i -->|role| o(request the url while posing as a browser, return the raw HTML)
    j -->|role| p(split and filter the HTML with regular expressions etc., return the results)
    k -->|role| q(write the parsed datalist to a file at savepath)

Day-1

Simulating URL requests

# -*- coding: utf-8 -*-
#@Time : 2020/7/12 20:27
#@Author : HUGBOY
#@File : hello_urllib.py
#@Software: PyCharm

#Urllib
'''----------------------|Overview|------------------
1. How urllib works
2. Simulating browser requests / get / post
----------------------------------------------'''
import urllib.request

# GET request

'''
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))
# response is an object / read() reads the body / decode('utf-8') decodes it as UTF-8
'''

# POST request (parameters must be passed in / hello : word)
'''
import urllib.parse
thedata = bytes(urllib.parse.urlencode({"hello":"word"}),encoding="utf-8")# encode the form data as a bytes payload
response = urllib.request.urlopen("http://httpbin.org/post",data = thedata)
print(response.read().decode("utf-8"))
'''
# Request timeout (was there a response within N seconds?)
'''
import urllib.parse
try:
    response = urllib.request.urlopen("http://httpbin.org/get",timeout=0.01)
    print(response.read().decode("utf-8"))
except Exception as rel:
    print("呀!请求超时啦,被发现了")
    print(rel)
'''

# Response status
'''
import urllib.parse
response = urllib.request.urlopen("http://baidu.com/get",timeout=5)
#print(response.status)# status code / 200 OK / 418 I'm-a-teapot, crawler detected / 404 not found
#print(response.getheaders())# all response headers
print(response.getheader("Server"))# one specific header

'''

# Posing as a browser - test (avoids 418)
'''
import urllib.parse
url = "http://httpbin.org/post"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"}
data = bytes(urllib.parse.urlencode({'name':'hugboy'}),encoding="utf-8")
req = urllib.request.Request(url=url,data=data,headers=headers,method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
'''
# Posing as a browser - real use (avoids 418)
import urllib.parse
url = "http://www.douban.com"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"}
req = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
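
If the User-Agent header is left out, douban answers with HTTP 418 and urlopen raises an exception. A small sketch of catching it with urllib.error (418 is simply what the site returned at the time of these notes):

import urllib.request, urllib.error

try:
    # no User-Agent: the site sees the default urllib client and refuses
    response = urllib.request.urlopen("http://www.douban.com")
except urllib.error.HTTPError as err:
    print(err.code)    # e.g. 418 ("I'm a teapot"), crawler detected
    print(err.reason)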

Regular expressions with re

# -*- coding: utf-8 -*-
#@Time : 2020/7/13 9:41
#@Author : HUGBOY
#@File : hello_re.py
#@Software: PyCharm
'''-------------Overview-----------------
Regular expressions
1. re
2. sub
----------------------------------'''
# Regular expressions

#re

import re
print("创建模式对象")

pat = re.compile("aa")#标准
r = pat.search("aa123aabc")#校验

print(r)
#<re.Match object; span=(0, 2), match='aa'>  search() returns only the first match

print("无需创建对象")
r = re.search(".com","www.heihei.com")#"标准","数据"
print(r)


print("找到并输出")
print(re.findall("a","a site named www.heihei.com hahahha"))
print(re.findall("[A-Z]","HugBoy"))
print(re.findall("[a-z]+","HugBoy"))

#sub

print("找到a用A替换")
print(re.sub("a","A","I am a vactory man!"))

print("抵消转译")
a = r"\n 'abc\'"
print(a)
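
One more idea the Day-3 spider leans on heavily: parentheses in a pattern form a capture group, and findall then returns only the captured part rather than the whole match. A tiny illustration (the tag and pattern here are made up for the example):

import re

html = '<span class="rating_num">9.7</span>'
print(re.findall(r'<span class="rating_num">(.*)</span>', html))
# ['9.7'] : only the group's content comes back, not the surrounding tag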

Day-2

Querying the tree structure with bs4

# -*- coding: utf-8 -*-
#@Time : 2020/7/12 22:25
#@Author : HUGBOY
#@File : hello_bs4.py
#@Software: PyCharm

'''----------------------|Overview|------------------
1. BeautifulSoup4 turns a complex HTML document into a tree
structure in which every node is a Python object. All objects
fall into 4 kinds:
- Tag
- NavigableString
- BeautifulSoup
- Comment
2. Traversal, searching, CSS selectors
----------------------------------------------'''

from bs4 import BeautifulSoup
f = open("./baidu.html","rb")# file contents: see [Appendix: day-2 | baidu.html]
html = f.read()
rel = BeautifulSoup(html,"html.parser")# parse the HTML into a tree structure



print("- Tag 标签及其内容(只拿到匹配到的第一个标签)")

# the page's title: <title>百度一下,你就知道 </title>
print(rel.title)

# a hyperlink in the page: <a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--新闻--></a>
print(rel.a)

print(type(rel.a))#<class 'bs4.element.Tag'>



print("- NavigableString  标签里的内容(字符串)")

print(rel.title.string)

print(rel.a.string)

print(rel.a.attrs)# all attributes of a tag, as a dict


print("- BeautifulSoup 整个为文档")

print(type(rel))#运行结果 <class 'bs4.BeautifulSoup'>
print(rel)


print("- Comment 注释符号里的内容")
#<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--新闻--></a>
print(rel.a.string)
print(type(rel.a.string))# output: <class 'bs4.element.Comment'>
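
Because a Comment prints without its <!-- --> markers, it is easy to mistake for visible page text. A minimal type check, continuing with the rel object above, tells the two apart:

from bs4.element import Comment

if isinstance(rel.a.string, Comment):
    print("a comment, not visible page text")
else:
    print("an ordinary NavigableString")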

# Traverse the document
print("A tag's contents, taken out as a list")
print(rel.head.contents)
print("One element of the list")
print(rel.head.contents[1])




# *Search the document

#1

print(" find_all()字符串过滤:查找与字符串完全匹配的内容")
t_list = rel.find_all("a")
print(t_list)

print("search() 正则表达式")
import re
t_list = rel.find_all(re.compile("a"))#所有标签中含'a'字母的标签及其字内容
print(t_list)

print("根据传入的函数要求搜索/搜索有name属性的标签")
def name_is(tag):
    return tag.has_attr("name")

t_list = rel.find_all(name_is)
print(t_list)

print("优雅的打印下,哈哈哈")
for one in t_list:
    print(one)

#2

print("含参数")
t_list = rel.find_all(id = "head")
for one in t_list:
    print(one)
print("有class")
t_list = rel.find_all(class_ = True)
for one in t_list:
    print(one)
print("指定某个")
t_list = rel.find_all(href="http://map.baidu.com")
for one in t_list:
    print(one)


#3

print("text参数")
#t_list = rel.find_all(text = "hao123")
t_list = rel.find_all(text = ["hao123","地图","贴吧","有吗"])
for one in t_list:
    print(one)

print("正则表达式查包含特定文本内容(标签里的字符串)/如查所有含数字的")
t_list = rel.find_all(text = re.compile("\d"))
for one in t_list:
    print(one)


print("limit限定个数")
t_list = rel.find_all("a",limit=3)
for one in t_list:
    print(one)


# CSS selectors

print("通过标签查照")
t_list = rel.select('title')
for one in t_list:
    print(one)

print("通过类名照")
t_list = rel.select(".mnav")
for one in t_list:
    print(one)

print("通过id查找")
t_list = rel.select("#u1")
for one in t_list:
    print(one)

print("通过属性值查找")
t_list = rel.select("a[class='bri']")
for one in t_list:
    print(one)

print("查找head里的title(通过子标签)")
t_list = rel.select("head>title")
for one in t_list:
    print(one)

print("查找兄弟标签(与某标签同级标签)")
t_list = rel.select(".mnav ~ .bri")
for one in t_list:
    print(one)
print("获得其中的文本")
print(t_list[0].get_text())
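
Putting the selectors to work: a short sketch that collects every link address on the page, still using the rel object parsed above (get('href') reads the tag attribute and returns None when it is missing):

print("All link addresses on the page")
for tag in rel.select("a"):
    print(tag.get("href"))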

[Appendix: day-2 | baidu.html]

<!DOCTYPE html>
<html>
<head>
    <meta content="text/html;charset=utf-8" http-equiv="content-type" />
    <meta content="IE=Edge" http-equiv="X-UA-Compatible" />
    <meta content="always" name="referrer" />
    <link href="https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css" />
    <title>百度一下,你就知道 </title>
</head>
<body link="#0000cc">
  <div id="wrapper">
    <div id="head">
        <div class="head_wrapper">
          <div id="u1">
            <a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--新闻123--></a>
            <a class="mnav" href="http://news.baidu.com" name="tj_trnews">新闻1</a>
            <a class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>
            <a class="mnav" href="http://map.bai521314du.com" name="tj_trmap">地图</a>
            <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">视频</a>
            <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">贴吧</a>
            <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">更多产品 </a>
          </div>
        </div>
    </div>
  </div>
</body>
</html>

Day-3

Scraping Douban Movie Top 250 data

Scraping result (screenshot)


[Source code]

This example saves the data to Excel; for saving to a database instead, see |>>>附:Python 爬虫 数据库保存数据<<<|.

# -*- coding: utf-8 -*-
# @Time : 2020/7/12 19:11
# @Author : HUGBOY
# @File : sp_douban.py
# @Software: PyCharm
'''----------------------|Overview|------------------
# Crawler
# Scrape Douban Top 250 movie data
# 1. Fetch the pages
# 2. Parse the data entry by entry
# 3. Save the data (xlwt | sqlite)
----------------------------------------------'''
from bs4 import BeautifulSoup  # parse pages, extract data
import re  # regular expressions
import urllib.request, urllib.error  # build URLs, fetch page data
import xlwt  # save to Excel, see [Appendix: day-3 | xlwt]
import sqlite3  # save to a database (used by the linked database version)


def main():
    baseurl = "https://movie.douban.com/top250/?start="
    datalist = getdata(baseurl)
    savepath = ".\\豆瓣电影Top250_data.xls"
    savedata(datalist,savepath)


# Regular-expression matching rules

findTitle = re.compile(r'<span class="title">(.*)</span>')  # movie title
findR = re.compile(r'<span class="inq">(.*)</span>')  # one-line review
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # rating
findPeople = re.compile(r'<span>(\d*)人评价</span>')  # number of raters; \d matches a digit
findLink = re.compile(r'<a href="(.*?)">')  # movie link
findImg = re.compile(r'<img.*src="(.*?)"', re.S)  # poster image; re.S lets . match newlines
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # synopsis
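
For intuition, here is findTitle applied to a fragment shaped like a real Top 250 entry (run in this module after the patterns above; the snippet is illustrative: on the live page the two title spans sit on separate lines, and without re.S the . in the pattern cannot cross the newline):

sample = '<span class="title">肖申克的救赎</span>\n<span class="title"> / The Shawshank Redemption</span>'
print(re.findall(findTitle, sample))
# ['肖申克的救赎', ' / The Shawshank Redemption'] : two hits, so getdata() below keeps both names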


def getdata(baseurl):
    datalist = []
    for i in range(0, 10):  # fetch 10 pages, 25 movies per page
        url = baseurl + str(i * 25)  # ?start=0, 25, 50, ... 225
        html = askURL(url)  # raw HTML of this page
        # parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            # print(item)  # everything about one movie
            data = []
            item = str(item)
            # extract the movie's details

            title = re.findall(findTitle, item)
            if (len(title) == 2):
                ctitle = title[0]  # Chinese title
                data.append(ctitle)
                otitle = title[1].replace("/", "")  # foreign title, with the '/' stripped
                data.append(otitle)
            else:
                data.append(title[0])
                data.append("No foreign title")
            img = re.findall(findImg, item)[0]
            data.append(img)
            link = re.findall(findLink, item)[0]  # re library: find the target string by regex
            data.append(link)
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            people = re.findall(findPeople, item)[0]
            data.append(people)

            r = re.findall(findR, item)
            if len(r) != 0:
                r = r[0].replace("。", "")  # drop the trailing full stop
                data.append(r)
            else:
                data.append("No one-line review")

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # strip <br/> tags
            bd = re.sub('/', " ", bd)  # replace /
            data.append(bd.strip())  # trim whitespace

            datalist.append(data)  # store this movie's record

    print(datalist)
    return datalist


# Fetch the page content at the given url
def askURL(url):
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)  # for testing
    except Exception as rel:
        if hasattr(rel, "code"):
            print(rel.code)
        if hasattr(rel, "reason"):
            print(rel.reason)
    return html


def savedata(datalist,savepath):
    book = xlwt.Workbook(encoding="utf-8",style_compression=0)# style compression setting
    sheet = book.add_sheet('DouBanMovieTop250',cell_overwrite_ok=True)# allow cell overwriting
    col = ("【Chinese Title】","【Foreign Title】","【Poster】","【Link】","【Rating】","【Raters】","【One-line Review】","【Synopsis】")
    for i in range(0,8):
        sheet.write(0,i,col[i])# header row
    for i in range(0,250):
        print("正在写入第%d条..."%(i+1))
        data = datalist[i]
        for k in range(0,8):
            sheet.write(i+1,k,data[k])

    print("正在保存...")
    book.save(savepath)



if __name__ == "__main__":
    main()
    print("爬取完成,奥利给!")

[Appendix: day-3 | xlwt] Saving to Excel

Run result (screenshot)

# -*- coding: utf-8 -*-
#@Time : 2020/7/13 14:48
#@Author : HUGBOY
#@File : hello_xlwt.py
#@Software: PyCharm



'''
import xlwt

workbook = xlwt.Workbook(encoding="utf-8")  # create a workbook (xlwt) object
worksheet = workbook.add_sheet('sheet')  # create a worksheet
worksheet.write(0,0,'hello world, my baby sheet!') # write(row, col, content)
workbook.save('九九乘法表.xls') # save
'''
# Exercise: write the 9x9 multiplication table into an Excel sheet

# 1. while version
'''

import xlwt

workbook = xlwt.Workbook(encoding="utf-8")  # create a workbook (xlwt) object
worksheet = workbook.add_sheet('sheet')  # create a worksheet
m = 0
while m < 9:
    m += 1
    n = 1
    while n <= m:
        # print("%d * %d = %d" % (m, n, m*n), end=' ')
        worksheet.write(m-1, n-1, "%dx%d=%d" % (m, n, m*n))  # write(row, col, content)
        n += 1
    else:
        worksheet.write(m-1, n-1, " ")  # while-else runs when the inner loop ends normally
        continue

workbook.save('九九乘法表.xls')  # save
'''
# 2. for version
import xlwt

workbook = xlwt.Workbook(encoding="utf-8")  # create a workbook (xlwt) object
worksheet = workbook.add_sheet('sheet')  # create a worksheet

for m in range(1,10):
    for n in range(1,m+1):
        worksheet.write(m-1,n-1,"%d * %d = %d"%(m,n,m*n))

workbook.save('九九乘法表.xls')  # save
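
To double-check the file without opening Excel, the xlrd package can read the sheet back. Note the assumption: xlrd is not used anywhere else in these notes and has to be installed separately; it reads the legacy .xls format that xlwt writes.

import xlrd  # assumed extra dependency: pip install xlrd

book = xlrd.open_workbook('九九乘法表.xls')
sheet = book.sheet_by_index(0)
for r in range(sheet.nrows):
    # cell_value(row, col) returns the stored cell content
    print('  '.join(str(sheet.cell_value(r, c)) for c in range(sheet.ncols)))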