Python Web Scraping Notes (2): Scraping the Douban Top 250, Writing to a Database, Data Visualization
1. Scraping the Douban Top 250 data
```python
# coding:utf-8
import urllib.request
import urllib.parse
import urllib.error
import re
from bs4 import BeautifulSoup
import xlwt

"""
urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
data: a byte-stream payload (urllib.parse.urlencode() plus bytes() can encode a dict into one).
      When data is supplied, the request method becomes POST.
origin_req_host: the host name or IP address of the requester.
unverifiable: whether the request is unverifiable; defaults to False and rarely needs setting.
method: a string naming the HTTP method to use, e.g. GET, POST or PUT.
"""
# url = 'https://www.douban.com'
# header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"}
# # class bytes([source[, encoding[, errors]]])
# content = bytes(urllib.parse.urlencode({"name": "wang"}), encoding="utf-8")
# request = urllib.request.Request(url=url, data=content, headers=header, method="POST")
# result = urllib.request.urlopen(request)
# print(result.read().decode("utf-8"))

# Regular expressions
# <a href="https://movie.douban.com/subject/1306249/" class="">
# The r"" prefix keeps backslashes from being treated as escape sequences.
findLink = re.compile(r'<a href="(.*?)">')
# <img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
findImg = re.compile(r'<img .*src="(.*?)".*>', re.S)  # re.S lets "." match newlines too
findName = re.compile(r'<span class="title">(.*)</span>')
findGrade = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findNum = re.compile(r'<span>(\d*)人评价</span>')  # "...人评价" means "... people rated"
findInq = re.compile(r'<span class="inq">(.*)</span>')
findOther = re.compile(r'<p class="">(.*?)</p>', re.S)


def main():
    # 1. Crawl the pages
    baseurl = 'https://movie.douban.com/top250?start='
    dataList = getData(baseurl)
    # 2. Save the data
    savePath = ".\\top250.xls"
    saveData(dataList, savePath)


# Crawl the pages
def getData(url):
    dataList = []
    for i in range(0, 10):  # 10 pages of 25 movies each
        baseurl = url + str(i * 25)
        html = askUrl(baseurl)
        # Parse the data item by item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            item = str(item)
            data = []  # holds all the information of one movie
            # Detail-page link
            link = re.findall(findLink, item)[0]
            data.append(link)
            # Poster image link
            img = re.findall(findImg, item)[0]
            data.append(img)
            # Movie titles
            name = re.findall(findName, item)
            if len(name) == 2:
                cname = name[0]
                data.append(cname)
                oname = name[1].replace("/", "").replace("\xa0", "")
                data.append(oname)
            else:
                cname = name[0]
                data.append(cname)
                data.append(" ")
            # Rating
            grade = re.findall(findGrade, item)[0]
            data.append(grade)
            # Number of ratings
            number = re.findall(findNum, item)[0]
            data.append(number)
            # One-line summary
            introduction = re.findall(findInq, item)
            if len(introduction) != 0:
                intro = introduction[0].replace("。", " ")  # drop the full stop
                data.append(intro)
            else:
                data.append(" ")
            # Other information (director, year, genre, ...)
            otherInfo = re.findall(findOther, item)[0]
            # re.sub(pattern, repl, string, count=0, flags=0)
            otherInfo = re.sub(r'<br(\s+)?/>(\s)?', " ", otherInfo)
            otherInfo = re.sub("/", " ", otherInfo)
            data.append(otherInfo.strip())
            dataList.append(data)
    return dataList


def saveData(dataList, savePath):
    # Create a workbook and set its encoding
    workbook = xlwt.Workbook(encoding="utf-8")
    # Create a worksheet
    worksheet = workbook.add_sheet("Douban movie top250", cell_overwrite_ok=True)
    # Header row
    setList = ("Detail link", "Poster link", "Chinese title", "Foreign title",
               "Rating", "Rating count", "Summary", "Other info")
    for i in range(0, 8):
        worksheet.write(0, i, setList[i])
    for i in range(0, 250):
        data = dataList[i]
        for j in range(0, 8):
            worksheet.write(i + 1, j, data[j])
    # Save the workbook
    workbook.save(savePath)


# Fetch the full content of the given page
def askUrl(url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"}
    request = urllib.request.Request(url, headers=header)
    html = ""  # so a failed request still returns a defined value
    try:
        result = urllib.request.urlopen(request)
        html = result.read().decode("utf-8")  # the page content
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


if __name__ == "__main__":
    main()
```
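As a quick sanity check, the workbook can be read back with xlrd; this is a minimal sketch assuming the script above has already produced top250.xls in the current directory (xlrd is not used by the scraper itself).

```python
# A minimal read-back check of the workbook written by saveData() above.
# Assumes top250.xls already exists in the current directory.
import xlrd

workbook = xlrd.open_workbook("top250.xls")
sheet = workbook.sheet_by_index(0)
print(sheet.nrows)          # expect 251 rows: one header row + 250 movies
print(sheet.row_values(0))  # the header row
print(sheet.row_values(1))  # the first movie
```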
2. Writing the scraped data to the database
```python
# coding:utf-8
from bs4 import BeautifulSoup  # pulls the relevant tags out of the page source instead of bare regexes
import urllib.parse
import urllib.request
import urllib.error
import re
import mysql.sqlExecute as sqlExecute

# Regular expressions
# <a href="https://movie.douban.com/subject/1306249/" class="">
# The r"" prefix keeps backslashes from being treated as escape sequences.
findLink = re.compile(r'<a href="(.*?)">')
# <img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
findImg = re.compile(r'<img .*src="(.*?)".*>', re.S)  # re.S lets "." match newlines too
findName = re.compile(r'<span class="title">(.*)</span>')
findGrade = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findNum = re.compile(r'<span>(\d*)人评价</span>')  # "...人评价" means "... people rated"
findInq = re.compile(r'<span class="inq">(.*)</span>')
findOther = re.compile(r'<p class="">(.*?)</p>', re.S)


def main():
    # The base URL
    url = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages and collect the rows as tuples
    dataList = getData(url)
    # 2. Save the rows to the database
    result = saveData(dataList)
    print(result)


# Step 1: fetch the full content of the given page
def askUrl(url):
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.9231 SLBChan/30'
    }
    request = urllib.request.Request(url, headers=header)
    html = ""
    try:
        result = urllib.request.urlopen(request)
        html = result.read().decode("utf-8")
    except urllib.error.URLError as e:
        '''HTTPError (a subclass of URLError) carries:
        code    an HTTP status code, one of the values in http.server.BaseHTTPRequestHandler.responses
        reason  usually a string explaining the cause of this error
        headers the HTTP response headers of the request that raised the HTTPError
        '''
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Step 2: crawl each page and parse it into rows shaped like [(1,2,3),(4,5,6)]
def getData(url):
    dataList = []
    for i in range(0, 10):
        baseUrl = url + str(i * 25)
        html = askUrl(baseUrl)
        # Parse the data item by item
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            data = ()  # holds the details of one movie
            # print(type(item))  -> bs4.element.Tag
            item = str(item)
            # Detail-page link
            link = re.findall(findLink, item)[0]
            data = data + (str(link),)
            # Poster image link
            img = re.findall(findImg, item)[0]
            data = data + (str(img),)
            # Movie titles
            name = re.findall(findName, item)
            if len(name) == 2:
                cname = name[0]
                data = data + (str(cname),)
                oname = name[1].replace("/", "").replace("\xa0", "")
                data = data + (str(oname),)
            else:
                cname = name[0]
                data = data + (str(cname),)
                data = data + (" ",)
            # Rating
            grade = re.findall(findGrade, item)[0]
            data = data + (str(grade),)
            # Number of ratings
            number = re.findall(findNum, item)[0]
            data = data + (str(number),)
            # One-line summary
            introduction = re.findall(findInq, item)
            if len(introduction) != 0:
                intro = introduction[0].replace("。", " ")  # drop the full stop
                data = data + (str(intro),)
            else:
                data = data + (" ",)
            # Other information (director, year, genre, ...)
            otherInfo = re.findall(findOther, item)[0]
            # re.sub(pattern, repl, string, count=0, flags=0)
            otherInfo = re.sub(r'<br(\s+)?/>(\s)?', " ", otherInfo)
            otherInfo = re.sub("/", " ", otherInfo)
            data = data + (otherInfo.strip(),)
            dataList.append(data)
    return dataList


# Step 3: save the rows to the database
def saveData(dataList):
    sqlDb = sqlExecute.MysqlDb()
    sql = "insert into movie_top(movie_link,movie_pic,movie_name,movie_foreign,movie_grade,movie_comment_num,movie_intro,movie_other) values(%s,%s,%s,%s,%s,%s,%s,%s)"
    result = sqlDb.execute_sql(sql, dataList)
    return result


if __name__ == "__main__":
    main()
```
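The script imports mysql.sqlExecute, a small helper module from an earlier note in this series that is not reproduced here. For reference, below is a minimal sketch of what such a MysqlDb wrapper might look like on top of pymysql; the class name and execute_sql signature match the usage above, but the connection parameters are placeholders, and a movie_top table with the eight columns named in the INSERT statement must already exist.

```python
# A minimal sketch of the MysqlDb helper used above, built on pymysql.
# Connection parameters are placeholders; adjust them to your own database.
import pymysql

class MysqlDb:
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="your_password", database="movie",
                                    charset="utf8mb4")

    def execute_sql(self, sql, dataList):
        try:
            with self.conn.cursor() as cursor:
                # executemany() binds every tuple in dataList to the %s placeholders
                rows = cursor.executemany(sql, dataList)
            self.conn.commit()
            return rows
        except pymysql.MySQLError as e:
            self.conn.rollback()
            return e
```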
3. Data visualization (see the example below)
```python
from pyecharts.charts import Bar

bar = Bar()
bar.add_xaxis(["Shirt", "Cardigan", "Chiffon top", "Trousers", "High heels", "Socks"])
bar.add_yaxis("Merchant A", [5, 20, 36, 10, 75, 90])
# render() writes a local HTML file, render.html in the current directory by default;
# a path can also be passed in, e.g. bar.render("mycharts.html")
bar.render()
```
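Applied to this project, the same Bar API can chart the scraped ratings, for example the first ten movies returned by getData() from section 1. A sketch, assuming that code is saved as douban_spider.py (a hypothetical module name) and pyecharts v1+ is installed:

```python
# Chart the ratings of the first ten movies in the Top 250.
# douban_spider is a hypothetical module name for the section 1 script.
from pyecharts.charts import Bar
from pyecharts import options as opts
from douban_spider import getData

dataList = getData("https://movie.douban.com/top250?start=")
top10 = dataList[:10]
names = [row[2] for row in top10]          # column 2: Chinese title
grades = [float(row[4]) for row in top10]  # column 4: rating

bar = Bar()
bar.add_xaxis(names)
bar.add_yaxis("Rating", grades)
bar.set_global_opts(title_opts=opts.TitleOpts(title="Douban Top 10 Ratings"))
bar.render("top10.html")  # writes top10.html to the current directory
```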
Final data: