一、cookie应用实例
import urllib.request
import urllib.parse

"""Visit the Renren profile page while carrying a logged-in session cookie.

Steps:
1. Log in to renren.com in a browser.
2. Capture the next request and copy the Cookie header it carries.
3. Send the request from code with that cookie attached.
4. If that is not enough, replay every request header (last resort).
"""

url = 'http://www.renren.com/971302264/profile'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
           'Cookie': 'anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7ABC87CAAA6A740CC;'
                     ' p=176166a1bb4a1d1a163443225f52e24e4; first_login_flag=1; ln_uact=18404904721; '
                     'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; '
                     't=21d77ab67402235d4282cf725f991aab4; societyguester=21d77ab67402235d4282cf725f991aab4; '
                     'id=971302264; xnsid=6d1019cd; ver=7.0; loginfrom=null; JSESSIONID=abcOB4RHNlyeq8Dv_7sUw; '
                     'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325729; '
                     'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325732; wp_fold=0'
           }

req = urllib.request.Request(url, headers=headers)
# Fix: close the HTTP response deterministically instead of leaking the socket.
with urllib.request.urlopen(req) as rep:
    body = rep.read()
with open('ren.html', 'wb') as fp:
    fp.write(body)
二、编程登录人人网
import urllib.request
import urllib.parse
import http.cookiejar

"""Log in to renren.com programmatically.

Steps:
1. Log in with a browser and capture the traffic.
2. Grab the target URL and the POST form fields.
3. Replay the request from code.
An opener built with an HTTPCookieProcessor stores the cookies set during
login and replays them automatically on subsequent requests.
"""

cj = http.cookiejar.CookieJar()                       # holds the session cookies
handler = urllib.request.HTTPCookieProcessor(cj)      # cookie-aware handler
opener = urllib.request.build_opener(handler)         # opener that reuses the jar

# NOTE(review): the trailing space in this URL looks accidental — confirm
# against the captured request before "fixing" it.
post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019531649636 '
form_data = {'email': '18404904721',
             'icode': '',
             'origURL': 'http://www.renren.com/home',
             'domain': 'renren.com',
             'key_id': '1',
             'captcha_type': 'web_login',
             'password': '641fd8bce69ff3a3acfb14fc094fefe9487f9b4f843d18063fcce22e0a468066',
             'rkey': '2c3ae276413c03a1eb5159d355895bd0',
             'f': 'http%3A%2F%2Fwww.renren.com%2F971302264%2Fprofile'}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36', }

form_data = urllib.parse.urlencode(form_data).encode()  # POST body must be bytes
req = urllib.request.Request(url=post_url, headers=headers)
# Fix: close the response; the body is not needed — the jar captures the cookies.
with opener.open(req, data=form_data) as rep:
    pass
    # print(rep.read().decode())

# Fetch the profile page through the same opener to verify the login worked:
# the opener automatically attaches the cookies collected above.
get_url = 'http://www.renren.com/971302264/profile'
req1 = urllib.request.Request(url=get_url, headers=headers)
with opener.open(req1) as rep1:
    page = rep1.read()
with open('guanli.html', 'wb') as fp:
    fp.write(page)
三、正则表达式提取内容
import re

# --- "()" sub-patterns (numbered groups + back-references) ---
# string = '<div><span>悟空</span></div>'
# # Match the string above; the tags must be symmetric.
# pattern = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
# print(pattern.search(string))

# --- greedy vs non-greedy matching ---
# string = '<div>八戒</div></div></div>'
# pattern1 = re.compile(r'<div>.*</div>')    # greedy: runs to the last </div>
# pattern2 = re.compile(r'<div>.*?</div>')   # lazy: stops at the first </div>
# print(pattern1.search(string))
# print(pattern2.search(string))

# --- re.M: multi-line mode, ^ matches at the start of every line ---
sample = '''beautiful'
beach'''
line_start = re.compile(r'^bea', re.M)
print(line_start.findall(sample))

# --- re.S: single-line mode, "." also matches newlines ---
# string = '<div>《沁园春-雪》' \
#          '北国风光,千里冰封,万里雪飘。' \
#          '望长城内外,惟余莽莽。' \
#          '大河上下,顿失滔滔。</div>'
# print(re.compile(r'.*', re.S).search(string))

# --- re.I: case-insensitive matching ---
# string = 'Life Is Short You Must Be Sexy'
# print(re.compile(r'life is short you must be sexy', re.I).search(string))

# --- substitution: module-level re.sub and Pattern.sub ---
sentence = 'Life Is Short You Must Be Sexy'
sexy_pat = re.compile(r'Sexy')
print(re.sub(sexy_pat, 'sao', sentence))
print(sexy_pat.sub('lang', sentence))


def func(a):
    """Replacement callback: return the matched number minus 3, as a string."""
    return str(int(a.group()) - 3)


# A callable replacement lets the substitution compute the new text.
print(re.compile(r'\d+').sub(func, '最佳身高为175cm'))
四、正则例子-爬取糗图图片
import urllib.request
import urllib.parse
import re
import os


def create_request(url, page):
    """Build a Request for one listing page: ``url`` + ``page`` + '/'.

    Args:
        url:  base listing URL ending with 'page/'.
        page: 1-based page number to append.
    Returns:
        urllib.request.Request with a browser User-Agent attached.
    """
    page_url = url + str(page) + '/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    return urllib.request.Request(url=page_url, headers=header)


def download_image(content):
    """Extract every thumbnail image URL from page HTML and download it into ./qiutu."""
    # re.S lets ".*?" cross line boundaries inside each thumb <div>.
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" .*?>.*?</div>', re.S)
    img_list = pattern.findall(content)
    dirname = 'qiutu'
    # Hoisted out of the loop: one existence check instead of one per image.
    os.makedirs(dirname, exist_ok=True)
    for img_src in img_list:
        img_url = 'https:' + img_src              # src is scheme-relative ("//...")
        img_name = img_url.split('/')[-1]         # last path segment as filename
        filepath = os.path.join(dirname, img_name)
        urllib.request.urlretrieve(img_url, filepath)


def main():
    """Prompt for an inclusive page range and download the images of each page."""
    url = 'https://www.qiushibaike.com/pic/page/'
    start_page = int(input('起始页码:'))
    end_page = int(input('结束页码:'))
    # Fix: end_page + 1 — the prompt asks for an inclusive end page, but the
    # original range(start_page, end_page) silently skipped the last page
    # (the sibling quote spider below already uses end_page + 1).
    for page in range(start_page, end_page + 1):
        print('第%s页开始下载...' % page)
        # Build the request, fetch the page, then parse and download.
        req = create_request(url, page)
        rep = urllib.request.urlopen(req).read().decode()
        download_image(rep)
        print('第%s页结束下载...' % page)


if __name__ == '__main__':
    main()
五、正则例子-爬取语录
import urllib.request
import urllib.parse
import re
import os


def create_request(url, page=None):
    """Build a Request; when ``page`` is given, append '<page>.html' to ``url``.

    With page=None the URL is used as-is (detail pages); with a page number
    it produces a listing-page URL like '.../list_10_2.html'.
    """
    # Fix: identity comparison with None is the idiomatic form (was "!= None").
    if page is not None:
        url = url + str(page) + '.html'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    return urllib.request.Request(url=url, headers=header)


def get_content(href):
    """Fetch one quote page and return its body text with <img> tags stripped."""
    request = create_request(href)
    content_html = urllib.request.urlopen(request).read().decode()
    pattern = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
    content_list = pattern.findall(pattern=None) if False else pattern.findall(content_html)
    # Robustness fix: the original indexed content_list[0] unconditionally and
    # raised IndexError whenever the page layout did not match the regex.
    if not content_list:
        return ''
    img_tag = re.compile(r'<img .*?>')
    return img_tag.sub('', content_list[0])


def parse_html(content):
    """Extract (href, title) pairs from a listing page and append each quote to yulu.html."""
    pattern = re.compile(
        r'<h3><a href="/mingrenjingdianyulu/(\d+/\d+/\d+\.html)"><b>(.*?)</b></a></h3>', re.S)
    title_list = pattern.findall(content)
    for href_part, title in title_list:
        # Rebuild the absolute link to the quote's detail page.
        href = 'http://www.yikexun.cn/mingrenjingdianyulu/' + href_part
        body = get_content(href)
        # Fix: close the document with </html> (the original emitted unbalanced HTML).
        string = '<!DOCTYPE html>' \
                 '<html lang="en">' \
                 '<head>' \
                 ' <meta charset="UTF-8">' \
                 ' <title>Title</title>' \
                 '</head>' \
                 '<body>' \
                 ' <h1>%s</h1>%s' \
                 '</body>' \
                 '</html>' % (title, body)
        with open('yulu.html', 'a', encoding='utf8') as fp:
            fp.write(string)


def main():
    """Prompt for an inclusive page range and save every quote on those pages."""
    url = 'http://www.yikexun.cn/mingrenjingdianyulu/list_10_'
    start_page = int(input('起始页码:'))
    end_page = int(input('结束页码:'))
    for page in range(start_page, end_page + 1):
        print('第%s页开始下载...' % page)
        # Build the request, fetch the listing, then parse and save the quotes.
        req = create_request(url, page)
        rep = urllib.request.urlopen(req).read().decode()
        parse_html(rep)
        print('第%s页结束下载...' % page)


if __name__ == '__main__':
    main()