Crawler Implementation Examples

The three scripts below cover session-based login with captcha recognition (Renren), login through a proxy with SSL verification disabled (gushiwen.org), and multithreaded video downloading with a thread pool (Pear Video).
Example 1: Renren login with a requests session and captcha recognition

import requests
from lxml import etree
from 爬虫.old_boy.p3 import get_code_text

# A session behaves almost exactly like the requests module itself: it sends
# requests in the same way, but any cookies produced along the way are stored
# and reused automatically on later requests.
session = requests.session()

url = 'http://www.renren.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Connection': 'close',
}

# Fetch the login page and pull the captcha image URL out of it.
response = session.get(url=url, headers=headers).content
xpath_data = etree.HTML(response)
pic = xpath_data.xpath('//*[@id="verifyPic_login"]/@src')[0]
print(pic)

# Download the captcha with the same session so it stays tied to our cookies.
pic = session.get(url=pic, headers=headers).content
with open('pic.jpg', 'wb') as fp:
    fp.write(pic)

# Recognize the captcha text.
result = get_code_text('pic.jpg')
# print(result)

login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019331853198'
data = {
    'captcha_type': 'web_login',
    'domain': 'renren.com',
    'email': '18744585483',
    'f': 'http%3A%2F%2Fwww.renren.com%2F970459497',
    'icode': result,
    'key_id': '1',
    'origURL': 'http://www.renren.com/home',
    'password': '9722733e821526e5879a37d439f40666e1af794712cad1fce23d83f7b2f57041',
    'rkey': '0de33e22f20835059cb6b28da4bffdc9'
}

# Log in; the session keeps the resulting login cookies.
response = session.post(url=login_url, headers=headers, data=data)

# Request the logged-in user's profile page, again through the session object.
detail_url = 'http://www.renren.com/970459497'
ren_response = session.get(url=detail_url, headers=headers).content
with open('./renren.html', 'wb') as fp:
    fp.write(ren_response)
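get_code_text is imported from the author's own 爬虫.old_boy.p3 module and is not shown here. As a rough illustration only, a minimal sketch of such a helper, assuming local OCR with pytesseract (the real module may instead call a paid captcha-recognition service), could look like this:

from PIL import Image
import pytesseract

def get_code_text(img_path):
    # Open the saved captcha image and run it through Tesseract OCR.
    img = Image.open(img_path)
    return pytesseract.image_to_string(img).strip()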
Example 2: gushiwen.org login through a proxy, with SSL verification disabled

import requests
from lxml import etree
from 爬虫.old_boy.p3 import get_code_text

url = 'https://so.gushiwen.org/user/login.aspx?from='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
proxies = {
    'http': '193.68.135.125:59278'
}

session = requests.session()

# Fetch the login page and extract the captcha image address.
response = session.get(url=url, headers=headers, verify=False, proxies=proxies).content
xpath_data = etree.HTML(response)
pic_src = 'https://so.gushiwen.org' + str(xpath_data.xpath('//*[@id="imgCode"]/@src')[0])
# print(pic_src)

# Download the captcha with the same session so it matches the cookies we hold.
pic = session.get(url=pic_src, headers=headers, verify=False, proxies=proxies).content
with open('pic.jpg', 'wb') as fp:
    fp.write(pic)

# Recognize the captcha text.
code = get_code_text('pic.jpg')
print(code)

post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
data = {
    "__VIEWSTATE": "ahdYeAQW0HtfdBdmYQKvu1cIOsMVQy6b8+Tl3fFmuwmB//7WZsi1kJXIrAcqfvRP5UVTbb74NTJ389/H6FgBc60xjuUtXmCu6V15vp7reQ3DjcBq01LPXOubOG8=",
    "__VIEWSTATEGENERATOR": "C93BE1AE",
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "862032955@qq.com",
    "pwd": "123456",
    "code": code,
    "denglu": "登录",
}

# Log in; the session keeps the resulting cookies.
session.post(url=post_url, headers=headers, data=data, verify=False, proxies=proxies)

# Fetch the logged-in user's collection page and save it.
detail_url = 'https://so.gushiwen.org/user/collect.aspx'
d_response = session.get(url=detail_url, verify=False, headers=headers, proxies=proxies).content
with open('古诗文.html', 'wb') as fp:
    fp.write(d_response)
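Every request above passes verify=False, so urllib3 emits an InsecureRequestWarning for each call. If that output is unwanted, the warning can be silenced once before the requests are made, for example:

import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)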
Example 3: Pear Video multithreaded download with multiprocessing.dummy.Pool

import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool
import random

url = 'https://www.pearvideo.com/category_8'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

response = requests.get(url=url, headers=headers, verify=False).content.decode()
xpath_data = etree.HTML(response)
li_list = xpath_data.xpath('//*[@id="listvideoListUl"]/li')

# Collect the real video URLs; this list feeds the thread pool below.
video_url_list = []
for li in li_list:
    v_href = 'https://www.pearvideo.com/' + li.xpath('.//div[@class="vervideo-bd"]/a/@href')[0]
    # The .mp4 address is embedded in a JavaScript variable on the detail page,
    # so pull it out with a regular expression.
    d_response = requests.get(url=v_href, headers=headers).content.decode()
    video_url = re.findall('srcUrl="(.*?)",', d_response, re.S)[0]
    video_url_list.append(video_url)
    # print(video_url)

# Create a pool of 5 threads.
pool = Pool(5)
downloadVideo = lambda link: requests.get(url=link, headers=headers).content

# The list returned by map holds the downloaded binary data of each video.
video_data_list = pool.map(downloadVideo, video_url_list)

def save_video(data):
    # Random integers can collide; see the collision-free variant below.
    i = random.randint(1, 1000)
    video_name = 'video/' + str(i) + '.mp4'
    with open(video_name, 'wb') as fp:
        fp.write(data)

pool.map(save_video, video_data_list)
pool.close()
pool.join()
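Because save_video names files with random.randint, two videos can end up with the same name and overwrite each other. A small sketch of a collision-free alternative, assuming the same pool.map(save_video, video_data_list) call, is to derive each file name from a uuid instead:

import uuid

def save_video(data):
    # uuid4 gives every downloaded video a unique file name.
    video_name = 'video/' + uuid.uuid4().hex + '.mp4'
    with open(video_name, 'wb') as fp:
        fp.write(data)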