# requests and regex-based crawlers
# def division(a, b):
# # write your code here
# try:
# c = int((a + b)/(a - b))
# return('a={0},b={1},(a + b)/(a - b)={2}'.format(a,b,c))
# except ZeroDivisionError as e:
# return('a = {0}, b ={1}, the denominator of (a+b)/(a-b) cannot be zero'.format(a,b))
# a=division(5,5)
# print(a)
"""
Answer to question 4
Determine whether a year is a leap year (divisible by 4 but not by 100, or divisible by 400)
year = input('Enter a year: ')
year = int(year)
if year % 4 == 0 and year % 100 != 0:
print('is a leap year')
elif year % 400 == 0:
print('is a leap year')
else:
print('not a leap year')
"""
# Answer to question 2
# argv = [1,2,3]  # hard-coded stand-in for command-line arguments (the real ones come from sys.argv)
# import sys
# name = str(argv[1])
# SMS_verification_code = str(argv[2])
# # print "Hello, name! Your validation code is SMS_verification_code
# # please keep it in secret."
# print("Hello, {0}! Your validation code is {1}, please keep it in secret".format(name,SMS_verification_code))
# If a piece of code cannot meet its functional requirements yet,
# mark it down so it can be implemented properly later
# try:
#     a = float(input('Enter the dividend: '))
#     b = float(input('Enter the divisor: '))
#     c = a/b
#     print('Quotient:', c)
# except ZeroDivisionError:
#     print('The divisor cannot be 0!')
# What is a crawler, and what is actually in a web page?
# When learning something, do you think about how to implement it, or just memorize it by rote?
# First crawler attempt: fetch the Sogou homepage and save it to a file
# import requests
# if __name__ == '__main__':
#     # step 1: specify the url
#     url = 'https://www.sogou.com/'
#     # step 2: send the request; get() returns a Response object
#     response = requests.get(url=url)
#     # step 3: get the response data; .text returns the body as a string
#     page_text = response.text
#     print(page_text)
#     # step 4: persist it to disk
#     with open('./sogou.html', 'w', encoding='utf-8') as fp:  # even a page that was fetched successfully may not display properly when opened locally
#         fp.write(page_text)
#     print('Crawl finished!')
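# One common reason a saved page looks broken when opened locally is a mismatched text encoding:
# requests guesses the encoding from the response headers, while response.apparent_encoding is
# guessed from the body and can be assigned back before reading .text. A minimal sketch of that
# check, reusing the Sogou url above (the output file name is arbitrary):
# import requests
# response = requests.get(url='https://www.sogou.com/')
# print(response.status_code, response.encoding, response.apparent_encoding)
# response.encoding = response.apparent_encoding  # decode .text with the detected encoding
# with open('./sogou_checked.html', 'w', encoding='utf-8') as fp:
#     fp.write(response.text)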
# UA spoofing: disguise the crawler's request identity (User-Agent) as a real browser to get past UA checks
# import requests
# if __name__ == '__main__':
#     # UA spoofing: put the corresponding User-Agent into a dict
#     headers = {
#         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # step 1: specify the url and its query parameter
#     url = 'https://www.sogou.com/web'
#     # pack the parameters carried by the url into a dict
#     kw = input('Enter a word:')
#     param = {
#         'query': kw  # 'query' is the parameter name the site expects; kw is the user input; the name can be read off the address bar / dev tools
#     }
#     # step 2: send the request to the url; the parameters are passed separately and encoded by requests
#     response = requests.get(url=url, params=param, headers=headers)
#     # step 3
#     page_text = response.text
#     # step 4
#     fileName = kw + '.html'
#     with open(fileName, 'w', encoding='utf-8') as fp:
#         fp.write(page_text)
#     print(fileName, 'saved successfully!')
# - POST request (carrying parameters)
# - the response data is JSON
# import requests
# import json
# if __name__ == '__main__':
#     # step 1: specify the URL (this endpoint is found by watching the XHR/ajax requests in the browser's dev tools)
#     post_url = 'https://fanyi.baidu.com/sug'
#     # step 2: UA spoofing
#     headers = {
#         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # step 3: prepare the POST parameters (similar to a GET request)
#     word = input('Enter a word:\n')
#     data = {
#         'kw': word
#     }
#     # step 4: send the request; POST takes url, data, headers (GET takes url, params, headers)
#     response = requests.post(url=post_url, data=data, headers=headers)
#     # step 5: get the response data; .json() returns a Python object
#     # (only call .json() if the response really is JSON -- check the Content-Type header to be sure)
#     dict_obj = response.json()
#     print(dict_obj)
#     # step 6: persist it
#     fileName = word + '.json'
#     fp = open(fileName, 'w', encoding='utf-8')
#     json.dump(dict_obj, fp=fp)  # json.dump serializes a Python object to JSON and writes it to an open file object (not a file name)
#     fp.close()
#     print('Over!')
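# As noted in step 5, .json() should only be called when the response really is JSON. A minimal
# sketch of checking the Content-Type header first (whether this endpoint still answers this way
# may vary; the fallback branch is only illustrative):
# import requests
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
# response = requests.post(url='https://fanyi.baidu.com/sug', data={'kw': 'dog'}, headers=headers)
# content_type = response.headers.get('Content-Type', '')
# if 'application/json' in content_type:
#     print(response.json())
# else:
#     print('not JSON, Content-Type is:', content_type)
#     print(response.text[:200])  # inspect the start of the raw body instead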
# Rewrite the POST request above from memory
# Key things to recall: the data parameter and json.dump
# import json
# import requests
# url = 'https://fanyi.baidu.com/sug'
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
# kw = input('Enter the text you want to translate: ')
# data = {
#     'kw': kw
# }
# response = requests.post(url=url, data=data, headers=headers)
# re_json = response.json()
# fp = open(kw + '.json', 'w', encoding='utf-8')
# json.dump(re_json, fp=fp, ensure_ascii=False, indent=4)
# # indent pretty-prints the output; json.dump escapes non-ASCII characters by default, so ensure_ascii=False is needed to write readable Chinese
# fp.close()
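# To see what ensure_ascii actually changes, json.dumps (the variant that returns a string instead
# of writing to a file) makes the difference easy to print. A minimal offline sketch:
# import json
# sample = {'kw': '你好', 'meaning': 'hello'}
# print(json.dumps(sample))                                # {"kw": "\u4f60\u597d", ...}  non-ASCII escaped by default
# print(json.dumps(sample, ensure_ascii=False))            # {"kw": "你好", ...}  characters kept readable
# print(json.dumps(sample, ensure_ascii=False, indent=4))  # indent pretty-prints across multiple lines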
# Crawl the Douban movie ranking list
# import requests
# import json
# if __name__ == '__main__':
#     url = 'https://movie.douban.com/j/chart/top_list'  # the movie list is not in the page's HTML; it is loaded via this ajax endpoint (found in dev tools), which can be requested directly
#     param = {
#         'type': '24',
#         'interval_id': '100:90',  # an interesting request parameter
#         'action': '',
#         'start': '0',   # index of the first movie to take from the list
#         'limit': '20'   # number of movies to take at a time
#     }
#     headers = {
#         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # print(url + param)  # TypeError: can only concatenate str (not dict) to str
#     response = requests.get(url=url, params=param, headers=headers)
#     list_data = response.json()
#     fp = open('./douban.json', 'w', encoding='utf-8')
#     json.dump(list_data, fp=fp, ensure_ascii=False, indent=4)
#     print('Over!')
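# As the commented-out print(url + param) above shows, a dict cannot be concatenated to a string;
# requests encodes the params dict into the query string itself, and the final url is visible as
# response.url. urllib.parse.urlencode shows the same encoding without sending anything. A minimal sketch:
# from urllib.parse import urlencode
# url = 'https://movie.douban.com/j/chart/top_list'
# param = {'type': '24', 'interval_id': '100:90', 'action': '', 'start': '0', 'limit': '20'}
# full_url = url + '?' + urlencode(param)
# print(full_url)  # .../top_list?type=24&interval_id=100%3A90&action=&start=0&limit=20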
# Return the KFC stores in a given city
# import requests
# import json
# if __name__ == '__main__':
#     post_url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'  # another site-specific endpoint, again taken from the ajax requests in dev tools
#     keyword = input('Enter the city to search: ')  # some cities (e.g. Jingdezhen) return no stores at all
#     data = {
#         'cname': '',
#         'pid': '',
#         'keyword': keyword,
#         'pageindex': '1',
#         'pageSize': '10'
#     }
#     headers = {
#         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     response = requests.post(url=post_url, data=data, headers=headers)
#     # option 1: persist the raw response
#     # page_text = response.text
#     # fileName = keyword + '.html'
#     # with open(fileName, 'w', encoding='utf-8') as fp:
#     #     fp.write(page_text)
#     # print(fileName, 'Over!')
#     # option 2: print the results directly
#     page = response.json()
#     for store in page['Table1']:  # 'Table1' is the key in the JSON response that holds the list of stores
#         StoreName = store['storeName']
#         address = store['addressDetail']
#         print('StoreName:' + StoreName, 'address:' + address + '\n')
# Build each company's detail-page url by joining the (fixed) domain with an id value
# - the id values can be taken from the JSON returned by the homepage's ajax request
# - the url domain is always the same; only the id parameter it carries differs
# - the company list on the homepage is itself loaded dynamically via ajax
# import requests
# import json
# if __name__ == '__main__':
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     id_list = []        # stores the company ids
#     all_data_list = []  # stores all the company detail data
#     # fetch the ids of the companies in batches
#     url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'  # again found among the ajax requests in the browser's dev tools
#     # pack the parameters
#     for page in range(1, 11):
#         page = str(page)
#         data = {
#             'on': 'true',
#             'page': page,
#             'pageSize': '15',
#             'productName': '',
#             'conditionType': '1',
#             'applyname': '',
#             'applysn': '',
#         }
#         json_ids = requests.post(url=url, headers=headers, data=data).json()
#         # take the value under the 'list' key of json_ids and iterate over it
#         for dic in json_ids['list']:
#             id_list.append(dic['ID'])
#     # print(id_list, '\n')
#     # the company details are also loaded dynamically; that request carries a single parameter, id,
#     # whose values come from the id list built above
#     post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
#     for company_id in id_list:
#         data = {
#             'id': company_id
#         }
#         json_detail = requests.post(url=post_url, data=data, headers=headers).json()
#         # print(json_detail, '-------------END----------')
#         all_data_list.append(json_detail)
#         all_data_list.append('---------------------------------------------------------')
#     # persist all_data_list
#     fp = open('./allData.json', 'w', encoding='utf-8')
#     json.dump(all_data_list, fp=fp, ensure_ascii=False, indent=4)  # indent pretty-prints the output
#     print('Over!')
# Crawling a page with regular expressions
# Task: download every image in the "糗图" (funny pictures) section of qiushibaike
# The markup around each image looks like this:
# '''<div class="thumb">
# <a href="/article/124098472" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12409/124098472/medium/HSN2WWN0TP1VUPNG.jpg" alt="糗事#124098472" class="illustration" width="100%" height="auto">
# </a>
# </div>'''
# import re
# import os
# import requests
# if __name__ == '__main__':
#     # create a folder to hold all the images
#     if not os.path.exists('./qiutuLibs'):
#         os.mkdir('./qiutuLibs')
#     url = 'https://www.qiushibaike.com/imgrank/'  # note: a stray trailing space in this url breaks the crawl
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # generic crawl: fetch the whole page for the url
#     page_text = requests.get(url=url, headers=headers).text
#     # print(page_text)
#     # focused crawl: extract every image url from the page
#     ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'  # only the (.*?) group around the src value is captured
#     img_src_list = re.findall(ex, page_text, re.S)  # scan the string and return every captured src as a list
#     # re.S makes '.' also match newlines, so the pattern can span lines
#     print(img_src_list)
#     for src in img_src_list:
#         # build the complete image url (the page uses protocol-relative '//' urls)
#         src = 'https:' + src
#         print('single image url:', src)
#         img_data = requests.get(url=src, headers=headers).content
#         # build the image file name from the last path segment
#         img_name = src.split('/')[-1]  # split cuts the string on '/'; [-1] takes the last piece
#         imgPath = './qiutuLibs/' + img_name
#         with open(imgPath, 'wb') as fp:
#             fp.write(img_data)
#         print(img_name, 'downloaded successfully!')
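# A minimal offline demo of the two points above: re.findall returns only the text captured by the
# (...) group, and re.S is needed because each <div class="thumb"> block spans several lines
# (the sample markup and domain below are made up):
# import re
# sample = '''<div class="thumb">
# <a href="/article/1" target="_blank">
# <img src="//pic.example.com/a.jpg" alt="demo" class="illustration">
# </a>
# </div>'''
# pattern = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
# print(re.findall(pattern, sample))        # []  without re.S, '.' does not match the newlines
# print(re.findall(pattern, sample, re.S))  # ['//pic.example.com/a.jpg']  only the captured group is returned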
# The same crawl again, written from memory, with debugging notes
# import os
# import re
# import requests
# if not os.path.exists('./qiutuLibs'):
#     os.mkdir('./qiutuLibs')
# url = 'https://www.qiushibaike.com/imgrank/'  # an earlier attempt had a trailing space ('.../imgrank/ '), which made findall return nothing
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
# # extract the image urls; the target markup looks like this:
# '''<div class="thumb">
# <a href="/article/124098472" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12409/124098472/medium/HSN2WWN0TP1VUPNG.jpg" alt="糗事#124098472" class="illustration" width="100%" height="auto">
# </a>
# </div>'''
# response = requests.get(url=url, headers=headers).text
# print(response)
# img_re = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'  # do not close the <img> tag in the pattern; capture only the src value with (.*?)
# # this regex is easy to get wrong: watch the quotes and the placement of the .*? wildcards
# img_list = re.findall(img_re, response, re.S)  # re.S lets '.' match newlines so the pattern can span lines
# print(img_list)  # if this prints an empty list, the regex or the url is wrong and the loop below does nothing
# for i in img_list:
#     url_img = 'https:' + i  # the src values are protocol-relative ('//...'), so prepend 'https:'
#     img_content = requests.get(url=url_img, headers=headers).content
#     img_name = i.split('/')[-1]  # split the url on '/' and use the last segment as the file name
#     img_path = './qiutuLibs/' + img_name
#     with open(img_path, 'wb') as fp:  # binary mode; 'w' with encoding='utf-8' raises TypeError: write() argument must be str, not bytes
#         fp.write(img_content)
#     print('downloaded successfully')
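# A minimal offline sketch of why the file must be opened in 'wb' mode: response.content is bytes,
# response.text is str, and a text-mode file object only accepts str (the file names are arbitrary):
# data = b'\x89PNG\r\n'  # stand-in for the bytes that requests returns as response.content
# with open('demo.bin', 'wb') as fp:  # binary mode accepts bytes
#     fp.write(data)
# # with open('demo.txt', 'w', encoding='utf-8') as fp:
# #     fp.write(data)  # TypeError: write() argument must be str, not bytes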
# Extend the code above so it crawls the images page by page
# import re
# import os
# import requests
# if __name__ == '__main__':
#     # create a folder to hold all the images
#     if not os.path.exists('./qiutuLibs'):
#         os.mkdir('./qiutuLibs')
#     # generic url template with a page-number placeholder
#     url = 'https://www.qiushibaike.com/imgrank/page/%d/'
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     for pageNum in range(1, 11):
#         # url for this page number
#         new_url = url % pageNum
#         # generic crawl: fetch the whole page for this url (one page's HTML per iteration)
#         page_text = requests.get(url=new_url, headers=headers).text
#         print(type(page_text))  # <class 'str'>
#         # focused crawl: extract every image url from the page
#         ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
#         img_src_list = re.findall(ex, page_text, re.S)
#         print(img_src_list)
#         for src in img_src_list:
#             # build the complete image url
#             src = 'https:' + src
#             img_data = requests.get(url=src, headers=headers).content
#             # build the image file name
#             img_name = src.split('/')[-1]
#             imgPath = './qiutuLibs/' + img_name
#             with open(imgPath, 'wb') as fp:
#                 fp.write(img_data)
#             print(img_name, 'downloaded successfully!')
# Keep pushing forward. Don't be afraid, don't over-plan, don't get lost. Just stay on the road and keep walking; even if you stall for a while, keep walking.