爬虫实战入门

爬虫0（正则表达式查找，get）

11月15日

##获取豆瓣电影top250



import requests
import re


# Scrape the Douban Top 250 list and print one title per movie.
# The browser dev tools (F12) show the listing page is fetched with a plain GET.
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}

# Compiled once, outside the paging loop. For every `<div class="info">` the
# lazy `.*?` stops at the first `<span class="title">` that follows, so each
# match yields a single title.
obj = re.compile(r'<div class="info">.*?<span class="title">(?P<name>.*?)</span>', re.S)

# The site lists 25 movies per page, so start=0, 25, ..., 225 covers all 250.
for i in range(0, 226, 25):
    param = {
        'start': i,
        'filter': ''
    }
    # The `with` block closes every response, not just the last one. The
    # timeout keeps a stalled connection from hanging the script forever.
    with requests.get(url, headers=headers, params=param, timeout=10) as resp:
        text = resp.text

    for name in obj.finditer(text):
        print(name.group('name'))

获取豆瓣电影top250

使用requests库和re库（正则表达式）

爬虫1 （正则表达式查找，进入子链接，get）

11月16日

import requests
import re
import urllib3
# Scrape the "2022必看热片" list from the dytt89 home page, follow every movie's
# detail page, and print its title plus the first download link found there.

# Empty on purpose: add a User-Agent here if the site starts rejecting requests.
headers = {}
url = 'https://www.dytt89.com/'

# Every request below uses verify=False, so silence urllib3's
# InsecureRequestWarning before the first request rather than after it.
urllib3.disable_warnings()

# Lazy `.*?` after the heading captures the first <ul> that follows it.
# A greedy `.*` would run on to the last <ul> of the whole page.
obj = re.compile(r'2022必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
# Site-relative detail-page links; the leading "/" is left out of the capture
# because `url` already ends with one.
obj2 = re.compile(r"href='/(?P<son_link>.*?)' title=")
# Page title with the "_电影天堂" site suffix excluded.
obj3 = re.compile(r"<title>(?P<name>.*?)_电影天堂</title>", re.S)
# Anchor text of a link inside a table row (group `link_xunlei`).
obj4 = re.compile(r"<tr>.*?<td style=.*?<a href=\".*?\">(?P<link_xunlei>.*?)</a>.*?</tr>.*?</tbody>", re.S)

with requests.get(url, headers=headers, verify=False, timeout=10) as resp:
    # The page declares a GB2312 charset; decode accordingly.
    resp.encoding = 'gb2312'
    resp_text = resp.text

# If the heading is missing (layout change, blocked request) fall back to an
# empty list instead of failing later with a NameError on `ul_text`.
section = obj.search(resp_text)
ul_text = section.group('ul') if section else ''
son_link_list = [m.group('son_link') for m in obj2.finditer(ul_text)]

for son_link_tem in son_link_list:
    url_tem = url + son_link_tem
    print(url_tem)
    # Close each detail-page response as soon as its text has been read.
    with requests.get(url_tem, headers=headers, verify=False, timeout=10) as resp:
        resp.encoding = 'gb2312'
        resp_text = resp.text

    for name in obj3.finditer(resp_text):
        print(name.group("name"))

    # Only the first link on the page is wanted.
    link = obj4.search(resp_text)
    if link:
        print(link.group("link_xunlei"))
        print('\n')

采用requests库和re库

通过requests发送GET请求，得到网页源代码，然后用正则表达式搜索源代码文本，找到电影子链接并进入，拿到电影名称以及它的下载（磁力）链接

爬虫2（找到菜名，菜价 post）

（post方式）

import requests
import re

# Query the Xinfadi market price endpoint with a POST request and print, in
# order, every product name, then every lowest price, then every highest price
# found in the response text.
url = 'http://www.xinfadi.com.cn/getCat.html'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}
# Form payload sent with the POST.
# NOTE(review): "1186" is presumably a product category id; confirm in the
# browser's network panel.
dw = {
    'prodCatid': "1186"
}

# The `with` block closes the response (the original never did). The timeout
# keeps a stalled connection from hanging the script.
with requests.post(url, headers=headers, data=dw, timeout=10) as resp:
    text = resp.text

# Each pattern captures the quoted string value that follows one key.
obj1 = re.compile(r'"prodName":"(?P<prodName>.*?)"', re.S)
obj2 = re.compile(r'"lowPrice":"(?P<lowPrice>.*?)"', re.S)
obj3 = re.compile(r'"highPrice":"(?P<highPrice>.*?)"', re.S)

# One pass per field, in the same order as before: names, lows, highs.
for pattern, field in ((obj1, 'prodName'), (obj2, 'lowPrice'), (obj3, 'highPrice')):
    for find in pattern.finditer(text):
        print(find.group(field))

抓包找post

模拟发送请求得到响应，然后对响应分析

找出了菜名，菜的最低价、最高价

当然这个没排序，学会导出表格的话应该还凑合能用。

爬虫3（豆瓣top250 demo版本，采用bs4中的BeautifulSoup函数）

##采用bs4中的BeautifulSoup 通过标签来查找内容

#首先还是获得页面源代码
import requests
from bs4 import  BeautifulSoup
# Fetch the first page of the Douban Top 250 list and print every title span,
# locating elements by tag and class with BeautifulSoup instead of regex.
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}

# The `with` block closes the response even if parsing raises.
with requests.get(url, headers=headers, timeout=10) as resp:
    # "html.parser" tells BeautifulSoup which parser to use for the markup.
    page = BeautifulSoup(resp.text, "html.parser")

# find() returns the first match or None; find_all() returns every match.
result1 = page.find("ol", class_='grid_view')
if result1 is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('movie list <ol class="grid_view"> not found in the response')

for tem in result1.find_all('span', class_='title'):
    # .text is the content wrapped by the tag.
    print(tem.text)
    # Blank line between entries.
    print('\n', end='')

采用bs4库中的BeautifulSoup类

通过标签来查找内容

爬虫4：

使用了正则表达式和BeautifulSoup

报错了

爬整个网站的图：（太爽了）

import requests
from bs4 import  BeautifulSoup
import re

##拿到页面源代码
# Walk the netbian.com listing pages, open every wallpaper detail page linked
# from them, and print the image URLs found on each detail page.
url_raw = 'http://www.netbian.com/'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}
# Loop-invariant patterns, compiled once.
href_pattern = re.compile(r'<a href="(?P<href>.*?)"', re.S)
src_pattern = re.compile(r'src="(?P<link>.*?)"', re.S)

for i in range(1, 251):
    # Page 1 is the site root; later pages are index_<n>.htm.
    son_url = f'index_{i}.htm' if i != 1 else ''
    url = url_raw + son_url

    # Fetch the listing page; the response is closed once it has been parsed.
    with requests.get(url, headers=headers, timeout=10) as resp:
        resp.encoding = 'gbk'
        page = BeautifulSoup(resp.text, 'html.parser')

    # If the container is missing, str(None) holds no links and the loop
    # below simply does nothing.
    result = page.find('div', class_='list')
    for match in href_pattern.finditer(str(result)):
        href = match.group("href")
        # Skip empty hrefs (indexing [0] used to raise IndexError) and
        # absolute links, which start with "h" as in "http...".
        if not href or href.startswith('h'):
            continue
        link = url_raw + href.strip('/')

        # Detail page: send the same headers as the listing request, and use
        # separate names so the outer loop's state is not overwritten.
        with requests.get(link, headers=headers, timeout=10) as child_resp:
            child_resp.encoding = 'gbk'
            child_page = BeautifulSoup(child_resp.text, 'html.parser')

        # Not every linked page has the endpage/pic containers; skip those
        # instead of crashing on None.
        endpage = child_page.find('div', class_='endpage')
        pic = endpage.find('div', class_="pic") if endpage is not None else None
        if pic is None:
            continue
        for src in src_pattern.finditer(str(pic)):
            print(src.group("link"))

posted @ 2022-12-01 19:04 今天吃大鸡腿阅读(131) 评论(0) 编辑收藏举报

刷新页面返回顶部

Hznuxxw