requests and regex-based crawlers


# def division(a, b):
#     # write your code here
#     try:
#       c = int((a + b)/(a - b))
#       return('a={0},b={1},(a + b)/(a - b)={2}'.format(a,b,c))
#     except ZeroDivisionError as e:
#        return('a = {0}, b ={1}, the denominator of (a+b)/(a-b) cannot be zero'.format(a,b))

# a=division(5,5)
# print(a)


"""
Answer to question 4
determine whether a year is a leap year
year = input('')
year = int(year)
if year % 4 == 0 and year % 100 != 0:
    print('is a leap year')
elif year % 400 == 0:
    print('is a leap year')
else:
    print('not a leap year')

"""

# Answer to question 2
# argv = [1,2,3]  # stand-in for sys.argv when not running from the command line

# import sys

# name = str(argv[1])
# SMS_verification_code = str(argv[2])

# # print "Hello, name! Your validation code is SMS_verification_code
# # please keep it in secret."
# print("Hello, {0}! Your validation code is {1}, please keep it in secret".format(name,SMS_verification_code))

# For code that cannot meet the functional requirement yet,
# note it down so it can be implemented properly later.

# try:
#     a = float(input('Enter the dividend: '))
#     b = float(input('Enter the divisor: '))
#     c = a/b
#     print('quotient:', c)
# except ZeroDivisionError:
#     print('The divisor cannot be 0!')


# What is a crawler, and what does a web page consist of?
# When learning something new, do you think about how to implement it, or just memorize it by rote?


# First crawler attempt: fetch the Sogou homepage and save it to a file
# import requests
# if __name__ == '__main__':
#     # step 1: specify the url
#     url = 'https://www.sogou.com/'
#     # step 2: send the request
#     # the get method returns a response object
#     response = requests.get(url=url)
#     # step 3: get the response data; .text returns it as a string
#     page_text = response.text  # string form of the page
#     print(page_text)
#     # step 4: persist to disk
#     with open('./sogou.html','w',encoding = 'utf-8') as fp: # apparently, even a fetched page may not render properly when opened
#         fp.write(page_text)
#     print('Crawling finished!')
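
# A minimal sketch (my own addition, not from the tutorial): checking the status code
# and encoding before saving; a wrong encoding is a common reason a saved page does
# not render properly.
# import requests
# response = requests.get('https://www.sogou.com/')
# print(response.status_code)        # 200 means the request succeeded
# print(response.encoding)           # encoding guessed from the HTTP headers
# print(response.apparent_encoding)  # encoding guessed from the page content
# response.encoding = response.apparent_encoding  # prefer the content-based guess
# with open('./sogou.html', 'w', encoding='utf-8') as fp:
#     fp.write(response.text)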


# # UA spoofing: disguise the crawler's request identity as a real browser so it passes User-Agent checks
# import requests
# if __name__ == '__main__':
#     # UA spoofing: put the corresponding User-Agent into a dict
#     headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # step 1: specify the url and query
#     url = 'https://www.sogou.com/web'
#     # pack the query parameters carried by the url into a dict
#     kw = input('Enter a word:')
#     param = {
#         'query': kw  # 'query' is the parameter name, kw the value typed in; the name can be found in the page's request details
#     }
#     # step 2: send the request to the url; the url carries the parameters, and requests encodes them for us
#     response = requests.get(url=url, params=param, headers=headers)
#     # step 3
#     page_text = response.text
#     # step 4
#     fileName = kw + '.html'
#     with open(fileName, 'w', encoding='utf-8') as fp:
#         fp.write(page_text)
#     print(fileName, 'saved successfully!')
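
# A small sketch (my own addition) of how requests encodes the params dict into the
# final query string; printing the prepared url makes "the url carries the parameters"
# concrete. The query value 'python' is just an example.
# import requests
# req = requests.Request('GET', 'https://www.sogou.com/web', params={'query': 'python'})
# print(req.prepare().url)  # https://www.sogou.com/web?query=python
# # after requests.get(...) the same encoded url is available as response.url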



# - POST request (carrying parameters)
# the response data is JSON
# import requests
# import json
# if __name__ == '__main__':
#     # step 1: specify the URL
#     post_url = 'https://fanyi.baidu.com/sug'  # how do people find this suffix? (it shows up in the browser's network panel)

#     # step 2: UA spoofing
#     headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }

#     # step 3: prepare the POST parameters (similar to a GET request)
#     word = input('Enter a word:\n')
#     data = {
#         'kw': word
#     }

#     # step 4: send the request with url, data, headers; a GET request uses url, params, headers
#     response = requests.post(url=post_url, data=data, headers=headers)

#     # step 5: get the response data: json() returns a Python object
#     # (only call json() if the response really is JSON, which can be confirmed via the Content-Type header)
#     dict_obj = response.json()
#     print(dict_obj)

#     # step 6: persist to disk
#     fileName = word + '.json'
#     fp = open(fileName, 'w', encoding='utf-8')
#     json.dump(dict_obj, fp=fp)  # json.dump() serializes a Python object to JSON and writes it to a file object
#     # it takes the object first, then the open file object as fp=; json.dumps() returns the JSON string instead
#     fp.close()
#     print('Over!')
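
# A minimal sketch (my own addition) of the Content-Type check mentioned in step 5,
# so json() is only called when the server actually returned JSON; it reuses the same
# endpoint and a shortened User-Agent as assumptions.
# import requests
# resp = requests.post('https://fanyi.baidu.com/sug', data={'kw': 'dog'},
#                      headers={'User-Agent': 'Mozilla/5.0'})
# content_type = resp.headers.get('Content-Type', '')
# if 'json' in content_type:
#     print(resp.json())
# else:
#     print('not JSON; Content-Type is', content_type)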

# Now write a POST request from memory, imitating the one above
# quick recall: the data parameter and json.dump

# import json
# import requests


# url = 'https://fanyi.baidu.com/sug'
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
# kw = input('Enter the text you want to translate:')
# data = {
#     'kw':kw
# }

# response = requests.post(headers=headers,data=data,url=url)
# re_json = response.json()

# fp = open(kw+'.json','w',encoding="utf-8")
# json.dump(re_json, fp=fp)  # indent pretty-prints the output, e.g. indent=4
# # json.dump escapes non-ASCII characters by default; pass ensure_ascii=False to write real Chinese characters
# fp.close()
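
# A self-contained sketch (my own addition) showing what ensure_ascii and indent change
# in json output; the sample dict is made up for illustration.
# import json
# sample = {'kw': '你好', 'translation': 'hello'}
# print(json.dumps(sample))                               # {"kw": "\u4f60\u597d", "translation": "hello"}
# print(json.dumps(sample, ensure_ascii=False, indent=2)) # real Chinese characters, pretty-printed
# with open('demo.json', 'w', encoding='utf-8') as fp:
#     json.dump(sample, fp, ensure_ascii=False, indent=2)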



# Crawl Douban movies
# import requests
# import json
# if __name__ == '__main__':
#     url = 'https://movie.douban.com/j/chart/top_list'  # the page itself renders empty; this ajax request can be found in the browser's network panel and hit directly
#     param = {
#         'type':'24',
#         'interval_id':'100:90',  # an interesting request parameter
#         'action':'',
#         'start':'0',   # index of the first movie to take from the library
#         'limit':'20'   # how many to take at a time
#     }
#     headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # print(url + param)  # TypeError: must be str, not dict; only str can be concatenated with str
#     response = requests.get(url=url, params=param, headers=headers)
#     list_data = response.json()
#     fp = open('./douban.json', 'w', encoding='utf-8')
#     json.dump(list_data, fp=fp, ensure_ascii=False, indent=True)  # indent pretty-prints; indent=4 is more idiomatic
#     print('Over!')


# Return the KFC restaurants in a given city
# import requests
# import json

# if __name__ == '__main__':
#     post_url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'  # another site-specific url, found in the browser's network panel
#     keyword = input('Enter the city to query:')  # restaurants in Jingdezhen are not returned

#     data = {
#         'cname': '',
#         'pid': '',
#         'keyword': keyword,
#         'pageindex': '1',
#         'pageSize': '10'
#     }
#     headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     response = requests.post(url=post_url, data=data, headers=headers)

#     # persist to disk
#     # page_text = response.text
#     # fileName = keyword + '.html'
#     # with open(fileName, 'w', encoding= 'utf-8') as fp:
#     #     fp.write(page_text)
#     # print(fileName, 'Over!')

#     # print directly instead
#     page = response.json()
#     for store in page['Table1']:  # 'Table1' is the key under which the response holds the list of stores
#         StoreName = store['storeName']
#         address = store['addressDetail']
#         print('StoreName:' + StoreName, 'address:' + address + '\n')


# The domain plus an id value concatenate into the full url of a company's detail page
# the id values can be taken from the JSON returned by the homepage's ajax request
# - the url's domain is always the same; only the carried parameter (id) differs
# the company info on the homepage is loaded dynamically via ajax
# import requests
# import json

# if __name__ == '__main__':
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     id_list = []  # store the company ids
#     all_data_list = []  # store all company detail data
#     # batch-fetch the id values of different companies
#     url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList' # again, this url comes from the page's ajax request
#     # wrap the parameters
#     for page in range(1, 11):
#         page = str(page)
#         data = {
#             'on': 'true',
#             'page': page,
#             'pageSize': '15',
#             'productName': '',
#             'conditionType': '1',
#             'applyname': '',
#             'applysn': '',
#         }
#         # the request must sit inside the loop, otherwise only the last page is fetched
#         json_ids = requests.post(url=url, headers=headers, data=data).json()
#         # take the value of the 'list' key from the json_ids dict and iterate over it
#         for dic in json_ids['list']:
#             id_list.append(dic['ID'])
#     # print(id_list,'\n')

#     # the company detail data is also loaded dynamically; it carries one parameter, id, whose values come from the id list built above
#     post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
#     for company_id in id_list:
#         data = {
#         'id': company_id
#         }

#         json_detail = requests.post(url=post_url, data=data, headers=headers).json()
#         #print(json_detail, '-------------END----------')
#         all_data_list.append(json_detail )
#         all_data_list.append('---------------------------------------------------------')


#     # persist all_data_list
#     fp = open('./allData.json', 'w', encoding='utf-8')
#     json.dump(all_data_list, fp=fp, ensure_ascii=False, indent=True)  # indent pretty-prints the output
#     print('Over!')



# Crawl pages with regular expressions


# Goal: crawl all the images under the Qiushibaike "funny pictures" section
# '''<div class="thumb">
# <a href="/article/124098472" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12409/124098472/medium/HSN2WWN0TP1VUPNG.jpg" alt="糗事#124098472" class="illustration" width="100%" height="auto">
# </a>
# </div>'''
# import re
# import os
# import requests

# if __name__ == '__main__':
#     # create a folder to hold all the images
#     if not os.path.exists('./qiutuLibs'):
#         os.mkdir('./qiutuLibs')

#     url = 'https://www.qiushibaike.com/imgrank/'  # note: no trailing space, or the regex finds nothing
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#     }
#     # general crawl: fetch the whole page for this url
#     page_text = requests.get(url=url, headers=headers).text
#     #print(page_text)

#     # focused crawl: parse all the image urls out of the page
#     ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'  # src="(.*?)" is captured; alt=".*?" is only matched
#     # only the part inside the capturing group () is returned; findall yields just that string for each match
#     img_src_list = re.findall(ex, page_text, re.S)  # scan the string and return all captured substrings as a list
#     # re.S makes . match any character, including newlines
#     print(img_src_list)
#     for src in img_src_list:
#         # build the full image url
#         src = 'https:' + src
#         print('single image url:', src)
#         img_data = requests.get(url=src, headers=headers).content
#         # derive the image filename
#         img_name = src.split('/')[-1]  # split the url on '/' and take the last segment
#         imgPath = './qiutuLibs/' + img_name
#         with open(imgPath, 'wb') as fp:
#             fp.write(img_data)
#         print(img_name, 'downloaded successfully!')
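
# A tiny self-contained demo (my own addition) of how re.findall behaves with a capturing
# group and re.S, using a hard-coded snippet with a made-up pic.example.com url instead of
# a live page.
# import re
# html = '''<div class="thumb">
# <a href="/article/1" target="_blank">
# <img src="//pic.example.com/1.jpg" alt="demo" class="illustration">
# </a>
# </div>'''
# pattern = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
# print(re.findall(pattern, html))        # [] -- without re.S, . does not cross newlines
# print(re.findall(pattern, html, re.S))  # ['//pic.example.com/1.jpg'] -- only the captured group is returned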

# import os
# import re
# import requests
# if not os.path.exists('./qiutuLibs'):
#     os.mkdir('./qiutuLibs')

# url = 'https://www.qiushibaike.com/imgrank/'  # the earlier attempt had a trailing space: 'https://www.qiushibaike.com/imgrank/ '

# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}

# # extract the image urls
# '''<div class="thumb">
# <a href="/article/124098472" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12409/124098472/medium/HSN2WWN0TP1VUPNG.jpg" alt="糗事#124098472" class="illustration" width="100%" height="auto">
# </a>
# </div>'''
# response = requests.get(url = url, headers = headers).text
# print(response)
# img_re = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'  # do not add a closing > after img; parentheses mark the part to capture, the other attributes are matched literally
# # a broken earlier variant, kept for comparison: '<div class="thumb">.*?<img src= "(.*?)" alt=.?*></div>'  (stray space after src=, .?* instead of .*?, extra >)
# # '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
# # this regex is easy to get wrong; be careful where the quotes and the capturing group go
# img_list = re.findall(img_re, response, re.S)  # this call came back empty while the url still had a trailing space
# print(img_list)
# # re.S lets . match newline characters
# print('nothing below runs when img_list is empty')
# for i in img_list:
#     url_img = 'https:' + i  # at first the 'https:' prefix was missing here
#     img_content = requests.get(url=url_img,headers=headers).content
#     img_name = i.split('/')[-1]  # split the string on '/' and take the last segment
#     img_path = './qiutuLibs/' + img_name  # './' is the current directory
#     with open(img_path,'wb') as fp:  # opening in text mode with encoding='utf-8' raises TypeError: write() argument must be str, not bytes
#         # the image bytes must be written in binary mode
#         fp.write(img_content)
#     print('downloaded successfully')




# Extend the code above so it crawls images page by page
# import re
# import os
# import requests

# if __name__ == '__main__':
#     # create a folder to hold all the images
#     if not os.path.exists('./qiutuLibs'):
#         os.mkdir('./qiutuLibs')
#     # set up a generic url template
#     url = 'https://www.qiushibaike.com/imgrank/page/%d/'
#     for pageNum in range(1, 11):
#         # url for this page number (the extra format() call was redundant and has been dropped)
#         new_url = url % pageNum
#         headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
#         }
#         # general crawl: fetch the whole page for this url
#         page_text = requests.get(url=new_url, headers=headers).text  # one page's HTML per iteration; what is its type?
#         print(type(page_text))  # <class 'str'>

#         # focused crawl: parse all the image urls out of the page
#         ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'

#         img_src_list = re.findall(ex, page_text, re.S)
#         print(img_src_list)
#         for src in img_src_list:
#             # build the full image url
#             src = 'https:' + src
#             img_data = requests.get(url=src, headers=headers).content
#             # derive the image filename
#             img_name = src.split('/')[-1]
#             imgPath = './qiutuLibs/' + img_name
#             with open(imgPath, 'wb') as fp:
#                 fp.write(img_data)
#             print(img_name, 'downloaded successfully!')







