Python爬虫练习(requests模块)
Python爬虫练习(requests模块)
关注公众号“轻松学编程”了解更多。
一、使用正则表达式解析页面和提取数据
1、爬取动态数据(js格式)
爬取http://fund.eastmoney.com/fund.html
流程:
######a.分析页面
用浏览器打开链接,清空已加载的数据,点击下一页,可看到动态数据已被封装成js格式:
var db = {...}
######b.获取url
######c.获取响应
######d.使用正则表达式清洗数据
######e.转为二维列表
######f.遍历
import re
import requests
'''
http://fund.eastmoney.com/Data/Fund_JJJZ_Data.aspx?t=1&lx=1&
letter=&gsid=&text=&sort=zdf,desc&page=3,200&dt=1530794512554&atfc=&onlySale=0
'''
def getFund(page):
    '''
    Fetch one page of fund data from fund.eastmoney.com and print every row.

    The endpoint answers with a JavaScript assignment (``var db = {...}``).
    The array that follows ``datas:`` is cut out with a regular expression
    and parsed into a list of rows.

    :param page: page number, interpolated into the ``page=<n>,200`` field
    :return: None; each parsed row is printed
    :raises ValueError: if the response has no ``datas:...,count`` section
    '''
    # Local import so this block does not rely on a file-level import.
    import ast

    # Adjacent literals keep the URL on one logical line; a triple-quoted
    # string would embed newline characters inside the query string.
    url = (
        "http://fund.eastmoney.com/Data/Fund_JJJZ_Data.aspx?t=1&lx=1&letter=&"
        "gsid=&text=&sort=zdf,desc&page=%d,200&dt=1530794512554&atfc=&onlySale=0"
    ) % page
    # Fetch the response body and decode it as UTF-8.
    response = requests.get(url).content.decode("utf-8")
    # Capture everything between "datas:" and ",count".
    matches = re.findall(".*datas:(.*),count", response)
    if not matches:
        raise ValueError("no 'datas:' section found in response for page %d" % page)
    # literal_eval accepts only Python literals, so text coming from the
    # remote server cannot execute code the way eval() would allow.
    datas = ast.literal_eval(matches[0])
    for data in datas:
        print(data)
if __name__ == '__main__':
    # Read the page number from the console, then fetch and print that page.
    getFund(int(input("输入页码:")))
二、使用bs4解析页面和提取数据
1、抓取51job岗位数量
流程:a.设置URL,请求头 b.获取响应 c.创建bs4对象 d.找到岗位数量标签类名
import requests
from bs4 import BeautifulSoup
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}
# Search-result page used to read the number of "python" positions.
url = "https://search.51job.com/list/030200,000000,0000,00,9,99,python,2,1.html"
# Download the page and decode the raw bytes as GBK.
pageBytes = requests.get(url, headers=headers).content
html = pageBytes.decode('gbk')
# Parse the document with the lxml parser.
soup = BeautifulSoup(html, 'lxml')
# The first element carrying class "rt" is read as the job-count text.
countTag = soup.select('.rt')[0]
jobNum = countTag.text
print(jobNum.strip())
输出:
共3506条职位
2、抓取基金
流程:a.设置URL,请求头 b.获取响应 c.创建bs4对象 d.找到基金列表类名 e.保存为csv文件
import csv
import requests
from bs4 import BeautifulSoup
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
# Open the output file once for the whole crawl instead of once per row.
# newline='' leaves line endings to the csv module; 'a+' appends to an
# existing file.
with open('./fund.csv', 'a+', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.writer(f)
    # Page numbers to crawl; the range currently covers page 1 only.
    for i in range(1, 2):
        url = "http://quote.stockstar.com/fund/stock_3_1_%d.html" % i
        response = requests.get(url, headers=headers)
        # The page's <meta> tag declares charset=gb2312. GBK is a superset
        # of GB2312, so the bytes are decoded as GBK.
        html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        # Every <tr> under <tbody id="datalist"> is one fund.
        stackList = soup.find('tbody', id='datalist').find_all('tr')
        print(len(stackList))
        for stack in stackList:
            # The link in the first cell carries both the fund code (its
            # text) and the detail URL (its href).
            codeLink = stack.select("td:nth-of-type(1) > a")[0]
            stackId = codeLink.text
            # Short name: link text in the second cell.
            stackName = stack.select("td:nth-of-type(2) > a")[0].text
            stackUrl = codeLink["href"]
            # Net value: text of the third cell.
            stackMoney = stack.select("td:nth-of-type(3)")[0].text
            print(stackId, stackName, stackUrl, stackMoney)
            writer.writerow([stackId, stackName, stackUrl, stackMoney])
3、把数据存入数据库(MySQL)
把fund.csv存入数据库中。
流程:a.创建fund数据库 b.连接数据库 c.创建游标 d.读取文本 e.循环写入文本 f.创建sql插入语句 g.执行sql语句,提交 h.关闭文件、游标、数据库连接
import pymysql
# csv is imported here because this script block reads fund.csv with it.
import csv

# Connect to the local MySQL server, database "fund".
conn = pymysql.connect(
    host='127.0.0.1', user='root', password="123456",
    database='fund', port=3306,
    charset='utf8'
)
try:
    cursor = conn.cursor()
    try:
        # %s placeholders let the driver quote and escape each value;
        # building the statement with string formatting breaks on values
        # containing quotes and allows SQL injection.
        sql = "insert into stock(stackId,stackName,stackUrl,stackMoney) values (%s,%s,%s,%s)"
        # csv.reader handles quoted fields that contain commas and drops
        # the line terminator; a plain split(',') does neither.
        with open('fund.csv', 'r', encoding='utf-8', newline='') as stockData:
            for s in csv.reader(stockData):
                # Skip blank or truncated lines instead of failing on s[3].
                if len(s) < 4:
                    continue
                print(s)
                print(s[3].strip())
                cursor.execute(sql, (s[0], s[1], s[2], s[3].strip()))
        # Commit once after all rows were inserted.
        conn.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection, even when reading or inserting failed.
    conn.close()
4、抓取岗位
流程:a.设置种子url获取总页数 b.循环获取每页的数据 c.获取响应 d.创建bs4对象 e.匹配符合条件的属性,获取岗位信息列表
import requests
from bs4 import BeautifulSoup
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getJobPageTotal(url):
    '''
    Return the number of result pages to crawl.

    :param url: seed URL; accepted but not inspected
    :return: the fixed page count, 2
    '''
    fixedPageTotal = 2
    return fixedPageTotal
def getJobDetial(url):
    '''
    Fetch a position's detail page and extract its fields.

    :param url: URL of the detail page
    :return: tuple (location, category, headcount, duties, requirements)
    '''
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'lxml')
    # The three cells of the "tr.c.bottomline" row each read
    # "<caption>:<value>"; only the value is kept.
    jobAddr = _dropCaption(
        soup.select('tr.c.bottomline > td:nth-of-type(1)')[0].text, '工作地点:')
    jobType = _dropCaption(
        soup.select('tr.c.bottomline > td:nth-of-type(2)')[0].text, '职位类别:')
    jobPNum = _dropCaption(
        soup.select('tr.c.bottomline > td:nth-of-type(3)')[0].text, '招聘人数:')
    # Look the "squareli" elements up once: the first is read as the duties,
    # the second as the requirements.
    sections = soup.find_all(class_='squareli')
    jobDuty = sections[0].text
    jobDuReq = sections[1].text
    return jobAddr, jobType, jobPNum, jobDuty, jobDuReq


def _dropCaption(text, caption):
    '''Remove a leading caption (and the colon after it) from a cell's text.'''
    # str.strip(chars) treats its argument as a SET of characters and trims
    # both ends, so it can also eat characters that belong to the value.
    # Only an exact leading caption is removed here.
    text = text.strip()
    captionWord = caption.rstrip(':')
    if text.startswith(captionWord):
        # Accept both the ASCII and the full-width colon after the caption.
        text = text[len(captionWord):].lstrip(':：').strip()
    return text
def getJobInfo(url):
    '''
    Print name, detail URL and detail fields of each position on a listing page.

    :param url: URL of one listing page
    :return: None; results are printed
    '''
    # Download the listing page and parse it with the lxml parser.
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'lxml')
    # A list passed as class_ matches rows carrying any of the listed classes.
    # NOTE(review): 'add' looks like a typo for 'odd' (rows presumably
    # alternate even/odd). 'odd' is matched as well and 'add' is kept so
    # nothing previously matched is lost - confirm against the live page.
    jobList = soup.find_all('tr', class_=['even', 'odd', 'add'])
    for job in jobList:
        # The link in the first cell holds the position name (its text) and
        # the relative detail URL (its href); look it up once.
        link = job.select('td:nth-of-type(1) > a')[0]
        jobName = link.text
        jobUrl = r"https://hr.tencent.com/" + link["href"]
        # Fetch the fields from the detail page.
        jobDetail = getJobDetial(jobUrl)
        print(jobName, jobUrl, jobDetail)
if __name__ == '__main__':
    # Search keywords; only the first entry is used.
    positions = ['python']
    keyword = positions[0]
    # Seed URL used to ask for the total number of pages.
    url = "https://hr.tencent.com/position.php?keywords=%s" % keyword
    pageTotal = getJobPageTotal(url)
    # Each page shows 10 positions, so the "start" offset grows by 10 per page
    # (e.g. position.php?keywords=python&start=20#a).
    for offset in range(0, pageTotal * 10, 10):
        url = r"https://hr.tencent.com/position.php?keywords=%s&start=%d#a" % (keyword, offset)
        getJobInfo(url)
三、使用xpath解析页面和提取数据
1、抓取51城市列表
流程:a.设置请求头、url b.获取响应 c.创建xml树形结构对象 d.匹配城市列表类名,获取城市列表
e.遍历城市列表
import requests
import lxml
from lxml import etree
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}
# Page that lists the cities.
url = "https://jobs.51job.com/"
# Download the page and decode the raw bytes as GBK.
rawPage = requests.get(url, headers=headers).content
response = rawPage.decode('gbk')
# Build the element tree used for the XPath queries.
mytree = etree.HTML(response)
# City links: <a> elements inside div.lkst under the first div with
# class "e e4".
cityList = mytree.xpath('//div[@class="e e4"][1]//div[@class="lkst"]/a')
for cityLink in cityList:
    # Link text is the city name; href is the city's listing URL.
    cityName = cityLink.xpath('./text()')[0]
    cityUrl = cityLink.xpath('./@href')[0]
    print(cityName, cityUrl)
2、抓取51岗位信息
流程:a.设置请求头、url b.获取响应 c.创建xml树形结构对象 d.匹配岗位列表类名,获取岗位列表
e.遍历岗位列表
import requests
import lxml
from lxml import etree
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getJobInfo(url):
    '''
    Print the fields of every position listed on a 51job city page.

    :param url: URL of the listing page
    :return: None; results are printed
    '''
    # Download the page, decode it as GBK and build the element tree.
    response = requests.get(url, headers=headers).content.decode('gbk')
    mytree = lxml.etree.HTML(response)
    # Each child <div> of div.detlist.gbox is one position.
    jobList = mytree.xpath('//div[@class="detlist gbox"]/div')
    for job in jobList:
        # Position name and URL come from the link inside span.title.
        jobName = job.xpath('.//span[@class="title"]/a/@title')[0]
        jobUrl = job.xpath('.//span[@class="title"]/a/@href')[0]
        # Company name.
        company = job.xpath('.//p[@class="info"]/a/@title')[0]
        # Work location.
        jobAddr = job.xpath('.//p[@class="info"]/span[@class="location name"]/text()')[0]
        # Salary: fall back to "面谈" when the third span has no text,
        # instead of raising IndexError on the empty result.
        money = job.xpath('.//p[@class="info"]/span[3]/text()')
        money = money[0] if money else "面谈"
        # Requirements: drop every whitespace character from the text.
        orders = job.xpath('.//p[@class="order"]/text()')[0]
        order = "".join(orders.split())
        # Job description.
        content = job.xpath('.//p[@class="text"]/text()')[0]
        print(jobName, jobUrl, company, jobAddr, money, order)
        print(content)
if __name__ == '__main__':
    # Print the positions listed on the Guangzhou page.
    getJobInfo("https://jobs.51job.com/guangzhou/")
3、抓取51岗位页数
流程:a.设置请求头、url b.获取响应 c.创建xml树形结构对象 d.匹配总页数类名 e.正则获取数量
import re
'''
a.设置请求体、url b.获取响应 c.创建xml树形结构对象 d.匹配总页数类名
'''
import requests
import lxml
from lxml import etree
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getJobTotalPage(url):
    '''
    Read the total number of result pages from a 51job listing page.

    :param url: URL of the listing page
    :return: int, total number of pages
    :raises ValueError: if the pager text contains no "共<digits>页"
    '''
    # Download the page, decode it as GBK and build the element tree.
    response = requests.get(url, headers=headers).content.decode('gbk')
    mytree = lxml.etree.HTML(response)
    # Text of the first <span> under the element with id "cppageno".
    pagerText = mytree.xpath('//*[@id="cppageno"]/span[1]/text()')[0]
    # Raw string avoids the invalid "\d" escape; "\d+" requires at least one
    # digit so int() never receives an empty string.
    found = re.search(r'.*共(\d+)页', pagerText)
    if found is None:
        raise ValueError("page count not found in pager text: %r" % pagerText)
    totalPage = found.group(1)
    print(totalPage)
    return int(totalPage)
if __name__ == '__main__':
    # Print the page count of the Beijing listing.
    getJobTotalPage("https://jobs.51job.com/beijing/")
4、抓取51job
流程:a.设置种子url b.获取城市列表 c.循环城市列表 d.获取总页数 e.获取岗位信息
import csv
import time
import requests
import lxml
from lxml import etree
import re
'''
a.设置种子url b.获取城市列表 c.循环城市列表 d.获取总页数 e.获取岗位信息
'''
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getCityList(url):
    '''
    Collect the city links shown on the 51job landing page.

    :param url: URL of the page that lists the cities
    :return: dict mapping city name to city URL
    '''
    # Download the page and decode the raw bytes as GBK.
    rawPage = requests.get(url, headers=headers).content
    tree = lxml.etree.HTML(rawPage.decode('gbk'))
    # City links: <a> elements inside div.lkst under the first div with
    # class "e e4".
    anchors = tree.xpath('//div[@class="e e4"][1]//div[@class="lkst"]/a')
    # Link text is the key, href the value; a repeated name keeps the last URL.
    return {
        anchor.xpath('./text()')[0]: anchor.xpath('./@href')[0]
        for anchor in anchors
    }
def getPageTotal(url):
    '''
    Read the total number of result pages from a 51job listing page.

    :param url: URL of the listing page
    :return: int, total number of pages
    :raises ValueError: if the pager text contains no "共<digits>页"
    '''
    # Download the page, decode it as GBK and build the element tree.
    response = requests.get(url, headers=headers).content.decode('gbk')
    mytree = lxml.etree.HTML(response)
    # Text of the first <span> under the element with id "cppageno".
    pagerText = mytree.xpath('//*[@id="cppageno"]/span[1]/text()')[0]
    # Raw string avoids the invalid "\d" escape; "\d+" requires at least one
    # digit so int() never receives an empty string.
    found = re.search(r'.*共(\d+)页', pagerText)
    if found is None:
        raise ValueError("page count not found in pager text: %r" % pagerText)
    return int(found.group(1))
def getJobInfo(cityName, cityUrl, pageTotal):
    '''
    Crawl every listing page of one city and append the positions to a CSV file.

    Rows are written to ``./data/<cityName>.csv`` in the order
    name, URL, company, location, salary, requirements, description.

    :param cityName: city name, used as the CSV file name
    :param cityUrl: base URL of the city's listing; "p<n>" is appended per page
    :param pageTotal: number of pages to crawl
    :return: None
    '''
    # Local import: os is needed only to create the output folder.
    import os

    # open() does not create missing directories.
    os.makedirs('./data', exist_ok=True)
    # Open the file once for the whole city instead of once per row.
    # newline='' leaves line endings to the csv module, which avoids blank
    # lines between rows on Windows.
    with open('./data/' + cityName + '.csv', 'a+', newline='',
              encoding='utf-8', errors='ignore') as f:
        writer = csv.writer(f)
        for page in range(1, pageTotal + 1):
            url = cityUrl + "p%d" % page
            # Download the page, decode it as GBK and build the element tree.
            response = requests.get(url, headers=headers).content.decode('gbk')
            mytree = lxml.etree.HTML(response)
            # Each child <div> of div.detlist.gbox is one position; an empty
            # result simply skips the page.
            jobList = mytree.xpath('//div[@class="detlist gbox"]/div')
            for job in jobList:
                # Position name and URL come from the link inside span.title.
                jobName = job.xpath('.//span[@class="title"]/a/@title')[0]
                jobUrl = job.xpath('.//span[@class="title"]/a/@href')[0]
                # Company name.
                company = job.xpath('.//p[@class="info"]/a/@title')[0]
                # Work location.
                jobAddr = job.xpath('.//p[@class="info"]/span[@class="location name"]/text()')[0]
                # Salary: "面谈" when the third span has no text.
                money = job.xpath('.//p[@class="info"]/span[3]/text()')
                money = money[0] if money else "面谈"
                # Requirements: drop every whitespace character. The join
                # also avoids reusing the page counter as a loop variable.
                orders = job.xpath('.//p[@class="order"]/text()')[0]
                order = "".join(orders.split())
                # Job description.
                content = job.xpath('.//p[@class="text"]/text()')[0]
                print(url)
                writer.writerow([jobName, jobUrl, company, jobAddr, money, order, content])
if __name__ == '__main__':
    # Seed URL: the page that lists the cities.
    starUrl = "https://jobs.51job.com/"
    # Mapping of city name to city URL.
    cityList = getCityList(starUrl)
    for cityName in cityList:
        cityUrl = cityList[cityName]
        # Ask how many pages this city has, pause one second, then crawl.
        pageTotal = getPageTotal(cityUrl)
        time.sleep(1)
        getJobInfo(cityName, cityUrl, pageTotal)
5、爬取Jsp页面
流程:a.设置种子url b.设置表单数据 c.获取响应 d.创建xml树形结构 e.获取总页数 f.循环获取每页数据 g.设置动态的表单数据 h.获取响应
import requests
import lxml
import math
from lxml import etree
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}
# Seed URL of the search endpoint.
starUrl = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp"
# Form fields that are identical for every request; "pagesnum" is added last.
commonFields = {
    "yzm": "yWFF",
    "ft": "",
    "ktrqks": "2018-07-07",
    "ktrqjs": "2018-08-07",
    "spc": "",
    "yg": "",
    "bg": "",
    "ah": "",
}
# First request: page "1", used only to read the total.
data = dict(commonFields, pagesnum="1")
response = requests.get(starUrl, data=data, headers=headers).text
mytree = lxml.etree.HTML(response)
# The <strong> element holds a number that is divided by 15 and rounded up
# to get the page count - presumably the record total at 15 rows per page;
# verify against the site.
recordCount = int(mytree.xpath('/html/body/div/div/font/strong/text()')[0])
pageTotal = math.ceil(recordCount / 15)
# Request every page in turn.
for i in range(1, pageTotal + 1):
    data = dict(commonFields, pagesnum=i)
    url = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp"
    response = requests.get(url, data=data, headers=headers).text
    mytree = lxml.etree.HTML(response)
    # All rows of the "report" table except the first one.
    reportList = mytree.xpath('//*[@id="report"]/tbody//tr[position()>1]')
    for report in reportList:
        # Court, courtroom and hearing time are the first three cells.
        fy = report.xpath('td[1]/font/text()')[0]
        ft = report.xpath('td[2]/font/text()')[0]
        fTime = report.xpath('td[3]/text()')[0]
        print(fy, ft, fTime)
6、项目交易
流程:a.获取总页数 b.获取项目信息
import requests
import lxml
from lxml import etree
# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getPageTotal():
    '''
    Read the total number of project-list pages from the first list page.

    :return: int, total number of pages
    '''
    firstPage = "http://bbs.pinggu.org/prj/list/-4_1.html"
    rawPage = requests.get(firstPage, headers=headers).content
    tree = lxml.etree.HTML(rawPage.decode('gbk'))
    # Text of the last link inside the element with class "liebiao_fanye".
    lastLinkText = tree.xpath('//*[@class="liebiao_fanye"]/a[last()]/text()')[0]
    # Remove the "..." marker and surrounding whitespace before converting.
    digits = lastLinkText.replace('...', '').strip()
    return int(digits)
def getDetailInfo(url):
    '''
    Fetch a project's detail page, print and return its main fields.

    :param url: URL of the project detail page
    :return: tuple (keyword, publisher, description)
    '''
    rawPage = requests.get(url, headers=headers).content
    tree = lxml.etree.HTML(rawPage.decode('gbk'))
    # First element with class "ul-xiangq" holds the detail items.
    detail = tree.xpath('//*[@class="ul-xiangq"]')[0]
    # Keyword (2nd item): empty string when the item has no text.
    keywordTexts = detail.xpath('./li[2]/span[2]/text()')
    pkey = keywordTexts[0] if keywordTexts else ''
    # Publisher (3rd item).
    pPublicer = detail.xpath('./li[3]/span[2]/text()')[0]
    # Description (6th item): all text pieces, each stripped, joined together.
    content = ''.join(
        piece.strip() for piece in detail.xpath('./li[6]/span[2]/text()')
    )
    print(pkey, pPublicer, content)
    return (pkey, pPublicer, content)
def getProjectInfo(pageTotal):
    '''
    Walk every project-list page and read the fields of each project.

    The fields are extracted but not returned or stored; the only output
    comes from getDetailInfo, which prints the detail fields.

    :param pageTotal: number of list pages to visit
    :return: None
    '''
    def firstOf(node, expr):
        # First XPath result of expr relative to node.
        return node.xpath(expr)[0]

    for pageNo in range(1, pageTotal + 1):
        url = "http://bbs.pinggu.org/prj/list/-4_%d.html" % pageNo
        rawPage = requests.get(url, headers=headers).content
        tree = lxml.etree.HTML(rawPage.decode('gbk'))
        # Each div.liebiao_tiao is one project row.
        for project in tree.xpath('//div[@class="liebiao_tiao"]'):
            # Project number and name.
            pId = firstOf(project, './/div[@class="guding"]/a/text()')
            pName = firstOf(project, './/div[@class="ming"]/a/text()')
            # Absolute URL of the detail page.
            pUrl = 'http://bbs.pinggu.org' + firstOf(project, './/div[@class="guding"]/a/@href')
            # Detail fields (also printed by getDetailInfo).
            pDetail = getDetailInfo(pUrl)
            # Price, number of bidders, publication time.
            pPrice = firstOf(project, './/div[@class="jiage"]/text()')
            pPNum = firstOf(project, './/div[@class="renshu"]/text()')
            pTime = firstOf(project, './/div[@class="shijian"]/text()')
if __name__ == '__main__':
    # Read the page count, then walk every list page.
    getProjectInfo(getPageTotal())
后记
【后记】为了让大家能够轻松学编程,我创建了一个公众号【轻松学编程】,里面有让你快速学会编程的文章,当然也有一些干货提高你的编程水平,也有一些编程项目适合做一些课程设计等课题。
也可加我微信【1257309054】,拉你进群,大家一起交流学习。
如果文章对您有帮助,请我喝杯咖啡吧!
公众号
关注我,我们一起成长~~