[python][爬虫] 爬虫基础知识 2
# 估计 16日下午 难以完成 qunar这个 爬虫的学习了
"""
以下是该爬虫所要学习的东西
0,需求: 先找出发地,再找目的地,通过url地址栏参数的组合,来获取所有的自由行产品,这里有个重要的函数方法 就是urllib.request.quote(),先把中文参数做URL编码,再把几个参数一同拼进url,用GET请求发给页面
1,数据的清洗与整理, 目前还是不太理解 query['query'] not in a # 为什么 a 好像也不是一个变量 到底是为什么呢(其实 a 是后面定义的临时 list,用来去重)
2,pool.Pool(8) 的使用
3,exception 的使用,这个是慢工,不过还是尽量积累一下
"""
说说 pool.Pool(), 如果不加参数 默认是按照系统的cpu core的数量
import multiprocessing
import time
from multiprocessing import Pool
def run(fn, delay=1):
    """Simulate a slow task: wait, then print the square of ``fn``.

    Args:
        fn: One element of the data list being processed (a number).
        delay: Seconds to sleep before printing. Defaults to 1, the value
            the demo originally hard-coded, so ``pool.map(run, data)``
            behaves exactly as before.
    """
    # The sleep stands in for real work so that the serial and pooled runs
    # take measurably different amounts of time.
    time.sleep(delay)
    print(fn * fn)
if __name__ == "__main__":
    numbers = [1, 2, 3, 4, 5, 6]

    # Serial pass: a single process handles the items one after another.
    print('shunxu:')
    serial_start = time.time()
    for number in numbers:
        run(number)
    serial_end = time.time()
    print("顺序执行时间:", int(serial_end - serial_start))

    # Parallel pass: hand the same items to a pool of worker processes.
    print('concurrent:')
    workers = Pool(10)  # pool with 10 worker processes
    # map() applies run() to every element of the list across the workers.
    workers.map(run, numbers)
    workers.close()  # stop accepting new tasks
    workers.join()  # block until the worker processes have exited
    parallel_end = time.time()
    # Measured from the end of the serial pass, so pool start-up is included.
    print("并行执行时间:", int(parallel_end - serial_end))
### 第二部分 qunar 的 dep 出发地获得
import requests
# Download the departure-city index and print every city name.
url = 'https://touch.dujia.qunar.com/depCities.qunar'
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() surfaces an HTTP error before .json() is attempted.
strhtml = requests.get(url, timeout=10)
strhtml.raise_for_status()
dep_dict = strhtml.json()
# dep_dict['data'] maps a key to a list of city names (the sample response
# pasted later in this post uses initial letters as keys).
for dep_item, dep_names in dep_dict['data'].items():
    for dep in dep_names:
        print(dep)
# 对于dict 我还是习惯 for (key:value) in dict_aaa: 这种方式,但是程序中似乎是通过 dep_item ,这个东西,就自动把 第二层的内容转化成 list了 ,我换了一个名称 叫做ddd
#for (key,value) in dep_dict['data']:
# print(key+':'+value)
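# 结果报错的: 直接遍历 dict 只会得到 key(这里是单个字母的字符串),没法拆成 (key,value);要同时取 key 和 value 需要写成 dep_dict['data'].items(),而且 value 是 list,不能直接和 str 用 + 拼接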
# Walk the mapping under 'data': each key is paired with its list of names.
for ddd, city_names in dep_dict['data'].items():
    for dep in city_names:
        print(dep)
# Two-level structure: the outer key 'data' is written out literally, the
# inner keys are reached through the loop variable.
# After the loop ``ddd`` still holds the last key visited, so the lookups
# below describe only that final entry.
print(type(ddd))
print(type(dep_dict['data']))
last_entry = dep_dict['data'][ddd]
print(type(last_entry))
print(*last_entry)
# 前三个 分别是 str, dict, list
# 最后 print 结果 z开头的地名, 只能这样想了, list 其实是变化的,从最开始 abc 开头的地名,一直在变(循环结束后 ddd 停在最后一个 key 'z' 上,所以这里取到的是 z 对应的 list)
"""
{"ret":true,"data":{"a":["澳门","阿坝州","阿克苏地区","阿拉尔","阿拉善盟","阿勒泰","阿里","安康","安庆","鞍山","安顺","安阳"],
"b":["北京","白城","百色","白沙","白山","白银","保定","宝鸡","保山","保亭","包头","巴彦淖尔","巴音郭楞","巴中","北海","北屯市","蚌埠","本溪","毕节","滨州","博尔塔拉","亳州"],
"c":["长春","长沙","成都","重庆","沧州","常德","昌都","长葛市","昌吉","长治","常州","巢湖","朝阳市","潮州","承德","澄迈","郴州","赤峰","池州","崇左","楚雄","滁州"],"d":["大理","大连","丹东","淡水","儋州","大庆","大同","大兴安岭","达州","德宏","德阳","德州市","定安","定西","迪庆","东方","东莞","东兴","东营"],"e":["鄂尔多斯","恩施","鄂州"],"f":["福州","防城港","佛山","抚顺","阜新","阜阳","抚州"],"g":["广州","贵阳","甘南","赣州","甘孜州","广安","广元","贵港","桂林","果洛藏族自治州","固原"],"h":["哈尔滨","海口","杭州","合肥","呼和浩特","海北藏族自治州","海东地区","海南藏族自治州","海西蒙古族藏族自治州","哈密","邯郸","汉中","鹤壁","河池","鹤岗","黑河","衡水","衡阳","和田","河源","菏泽","贺州","红河","淮安","淮北","怀化","淮南","花莲","黄冈","黄南藏族自治州","黄山","黄石","惠州","葫芦岛","呼伦贝尔","湖州"],"j":["济南","佳木斯","吉安","江门","焦作","嘉兴","嘉义","嘉峪关","揭阳","吉林市","金昌","晋城","景德镇","荆门","荆州","金华","济宁","晋中","锦州","九江","酒泉","鸡西","济源"],"k":["昆明","开封","喀什","克拉玛依","克孜勒苏柯尔克孜","克孜勒苏","昆山市"],"l":["兰州","拉萨","来宾","莱芜","廊坊","乐东","乐山","凉山州","连云港","聊城","辽阳","辽源","丽江","临沧","临汾","临高","陵水","临夏","临沂","林芝","丽水","六安","六盘水","柳州","陇南","龙岩","娄底","漯河","洛阳","泸州","吕梁"],"m":["马鞍山","茂名","眉山","梅州","绵阳","牡丹江"],"n":["南昌","南京","南宁","南充","南平","南通","南投","南阳","那曲","内江","宁波","宁德","怒江"],"p":["盘锦","攀枝花","平顶山市","屏东","平凉","萍乡","普洱","普宁","莆田","濮阳"],"q":["黔东南","潜江","黔南","黔西南","青岛","庆阳","清远","秦皇岛","钦州","琼海","琼中","齐齐哈尔","七台河","泉州","曲靖","衢州"],"r":["日喀则","日照","瑞金市"],"s":["上海","沈阳","石家庄","三门峡","三明","三沙","三亚","商洛","商丘","上饶","山南","汕头","汕尾","韶关","绍兴","邵阳","神农架","深圳","石河子","十堰","石嘴山","双鸭山","朔州","四平","松原","绥化","遂宁","随州","宿迁","宿州","苏州"],"t":["台北","太原","天津","塔城地区","泰安","台中","台州","泰州","唐山","天水","铁岭","铜川","通化","通辽","铜陵","铜仁","吐鲁番","图木舒克","屯昌"],"w":["武汉","乌鲁木齐","万宁","潍坊","威海","渭南","文昌","文山","温州","乌海","芜湖","五家渠市","乌兰察布","武威","无锡","婺源","五指山","吴忠","梧州"],"x":["西安","香港","西宁","厦门","湘潭","湘西","襄阳","咸宁","仙桃","咸阳","孝感","西昌市","锡林郭勒盟","西南中沙群岛办事处","兴安盟","邢台","新乡","信阳","新余","忻州","西双版纳","宣城","许昌","徐州"],"y":["银川","雅安","延安","延边","盐城","阳江","阳泉","扬州","延吉市","烟台","宜宾","宜昌","伊春","宜春","伊犁","伊犁哈萨克自治州","营口","鹰潭","义乌市","益阳","永州","岳阳","玉林","榆林","运城","云浮","玉树藏族自治州","玉溪"],"z":["郑州","枣庄","张家界","张家口","张掖","漳州","湛江","肇庆","昭通","镇江","中山","中卫","周口","舟山","珠海","驻马店","株洲","淄博","自贡","资阳","遵义"]}}
"""
"""
'https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(urllib.request.quote(dep))
浏览器 输入以下地址 ,会返回一些数据
https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175
{"ret":true,"data":[{"score":120,"module":"cf","title":"99嘻游季","subModules":[]},
{"score":110,"module":"rm","title":"热门","subModules":[]},
{"score":100,"module":"zb","title":"周边","subModules":[]},
{"score":90,"module":"gn","title":"国内","subModules":[]},
{"score":80,"module":"mq","title":"免签/落地签","subModules":[]},{"score":60,"module":"gat","title":"港澳台","subModules":[]},{"score":50,"module":"rh","title":"日韩","subModules":[]},{"score":40,"module":"dny","title":"东南亚","subModules":[]},{"score":30,"module":"oz","title":"欧洲","subModules":[]},{"score":20,"module":"mz","title":"美洲","subModules":[]},{"score":10,"module":"ax","title":"澳新","subModules":[]},{"score":9,"module":"zhdf","title":"中东非","subModules":[]}],"status":0}
"""
# 通过上边的数据 ,可以看到 目的地数据结构至少是三层。 data一层,subModules 一层, 第三层,暂时叫做 items
# 另外 a这个临时list 主要用于去重
# Collect the de-duplicated destination names recommended for one departure
# city and print them.
# Imported here because nothing earlier in this walkthrough imports urllib;
# urllib.parse.quote is the documented home of the quote() helper.
from urllib.parse import quote

dep = '上海'
# The city name is percent-encoded before it is placed in the query string.
url = 'https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(quote(dep))
# A timeout keeps the script from hanging forever on a stalled connection.
strhtml = requests.get(url, timeout=10)
arrive_dict = strhtml.json()

# A dict gives constant-time duplicate checks while preserving first-seen
# order. Assumes query['query'] is a string (the printed output in this post
# shows place names) — confirm against the live response.
seen = {}
for arr_item in arrive_dict['data']:
    for arr_item_1 in arr_item['subModules']:
        print(arr_item_1['title'])
        for query in arr_item_1['items']:
            seen.setdefault(query['query'], None)

# ``a`` stays a plain list of unique destinations, in first-seen order.
a = list(seen)
print(*a)
三亚 丽江 厦门 哈尔滨 长白山 西双版纳 青海湖 成都 北海 广州 西安 桂林 张家界 雪乡 珠海 泰国 日本 巴厘岛 芽庄 越南 普吉岛 新加坡 塞班岛 迪拜 沙巴 土耳其 马尔代夫 清迈 柬埔寨 斯里兰卡 上海迪士尼乐园 杭州 苏州 黄山 松江雪浪湖温泉 普陀山 上海浦江源温泉农庄 乌镇 南京 天目湖御水温泉 千岛湖 常州 莫干山 嵊泗列岛 宋城 横店影视城 重庆 大理 腾冲 张掖 喀纳斯 乌鲁木齐 兰州 北京 呼伦贝尔 五台山 太原 平遥古城 天津 大连 大兴安岭 沈阳 凤凰古城 恩施 长沙 武汉 神农架 涠洲岛 长滩岛 埃及 文莱 尼泊尔 缅甸 香港 台北 澳门 高雄 垦丁公园热带风景区 花莲 北海道 冲绳 东京 大阪 名古屋 箱根 京都 福冈 济州岛 马来西亚 甲米 俄罗斯 希腊 瑞士 西班牙 莫斯科 法国 英国 波兰 美国 夏威夷 加拿大 阿根廷 洛杉矶 澳大利亚 新西兰 悉尼 皇后镇 墨尔本 凯恩斯 摩洛哥 以色列 毛里求斯 塞舌尔
# 忽然有个需求,就是有没有能够 自动遍历 list或者 dict结构的模块, 然后以图形化的方式 显示一下 总体情况
# 上边这个目的地的数据,我已经尽我所能来展示了,利用了三个 for 来写这段,是建立在对环境和数据有清晰了解的基础上的
# 关于重定向问题 : 推荐两个链接吧 ,方向就是 f12 通过检查网页元素,获取 redirect 里边的 href, 以及location 这些关键字段吧
# 另外也可以 通过 scrapy shell 来试试看 ,但是 我尝试了,这里获取的url 并非我想要的, 例子中qunar 这个重定向 ,相信他是仔细分析了网站和网页结构的结果
Scrapy 解决重定向的问题
https://blog.csdn.net/Gpwner/article/details/78404192
三种方法
https://www.cnblogs.com/zhumengke/articles/9618368.html
这个脚本案例,后边还提到了 mongodb的连接和使用,如果是pymongo 连接,建库,建集合的话,相对于mysql 还更简洁一些,这也许是很多爬虫er喜欢用mongo的原因之一,同时我也就不在这里耗费更多笔墨了 ,附件脚本里边4-5行代码就承担了这个功能
。今天草草收场了,也许是碰到一些难关了
# 多线程和备忘录 ,还能用用
# 页面分析,url重定向,post or urllib.request.quote , response获取, 网站结构分析--不小的一块
# dict 和list 的空间思维 --- 看悟性
# pymongo的 使用 -- 比较简单
# scrapy的工具 功能,场景 ,案例 ----后边仔细学一下
走了看看 《今日简史》去!!
import time
# Import the submodule explicitly: a bare ``import urllib`` does not load
# ``urllib.request``; it only appeared to work because ``requests`` happens
# to import it. This still binds the name ``urllib``.
import urllib.request

import pymongo
import requests

# Module-level MongoDB handle: database 'qunar', collection 'qunar_zyx' on
# the local server. The crawler functions write their results here.
client = pymongo.MongoClient('127.0.0.1', 27017)
book_qunar = client['qunar']
sheet_qunar_zyx = book_qunar['qunar_zyx']

# First, single-script version of the crawler, kept commented out for
# reference (indentation reconstructed from the statement structure):
#
# url='https://touch.dujia.qunar.com/depCities.qunar'
# strhtml=requests.get(url)
# dep_dict=strhtml.json()
# for dep_item in dep_dict['data']:
#     for dep in dep_dict['data'][dep_item]:
#         a = []
#         print(dep)
#         url = 'https://m.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(urllib.request.quote(dep))
#         time.sleep(1)
#         strhtml = requests.get(url)
#         arrive_dict = strhtml.json()
#         for arr_item in arrive_dict['data']:
#             for arr_item_1 in arr_item['subModules']:
#                 for query in arr_item_1['items']:
#                     if query['query'] not in a:
#                         a.append(query['query'])
#         for item in a:
#             url = 'https://touch.dujia.qunar.com/list?modules=list,bookingInfo&dep={}&query={}&mtype=all&ddt=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&it=FreetripTouchin&et=FreetripTouch&date=&configDepNew=&needNoResult=true&originalquery={}&limit=0,20&includeAD=true&qsact=search'.format(urllib.request.quote(dep),urllib.request.quote(item),urllib.request.quote(item))
#             time.sleep(1)
#             strhtml = requests.get(url)
#             routeCount=int(strhtml.json()['data']['limit']['routeCount'])
#             for limit in range(0, routeCount, 20):
#                 url = 'https://touch.dujia.qunar.com/list?modules=list,bookingInfo&dep={}&query={}&mtype=all&ddt=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&it=FreetripTouchin&et=FreetripTouch&date=&configDepNew=&needNoResult=true&originalquery={}&limit={},20&includeAD=true&qsact=search'.format(
#                     urllib.request.quote(dep), urllib.request.quote(item),
#                     urllib.request.quote(item),limit)
#                 time.sleep(1)
#                 strhtml = requests.get(url)
#                 result = {
#                     'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
#                     'dep': dep,
#                     'arrive': item,
#                     'limit': limit,
#                     'result': strhtml.json()
#                 }
#                 print(strhtml.text)
#                 sheet_qunar_zyx.insert_one(result)
# Endpoints used by the crawler. The list URL is paged through the
# ``limit=<offset>,20`` parameter.
_DEP_CITIES_URL = 'https://touch.dujia.qunar.com/depCities.qunar'
_ARRIVE_URL = (
    'https://m.dujia.qunar.com/golfz/sight/arriveRecommend'
    '?dep={}&exclude=&extensionImg=255,175'
)
_LIST_URL = (
    'https://touch.dujia.qunar.com/list?modules=list,bookingInfo'
    '&dep={dep}&query={query}&mtype=all&ddt=false'
    '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=FreetripTouchin&et=FreetripTouch&date=&configDepNew='
    '&needNoResult=true&originalquery={query}&limit={limit},20'
    '&includeAD=true&qsact=search'
)
_PAGE_SIZE = 20


def _list_url(dep, item, limit):
    """Build the product-list URL for one departure/destination page."""
    return _LIST_URL.format(
        dep=urllib.request.quote(dep),
        query=urllib.request.quote(item),
        limit=limit,
    )


def get_list(dep, item, collection=None):
    """Fetch every result page for one route and store each page in MongoDB.

    Args:
        dep: Departure city name.
        item: Destination (search query) name.
        collection: Object with an ``insert_one`` method that receives one
            document per page. Defaults to the module-level
            ``sheet_qunar_zyx`` collection, as the original code did.

    Returns:
        None. Returns early, storing nothing, when the first response does
        not contain a usable ``data.limit.routeCount``.
    """
    first_page = get_json(_list_url(dep, item, 0))
    try:
        route_count = int(first_page['data']['limit']['routeCount'])
    except (KeyError, TypeError, ValueError):
        # No route count in the response: nothing to page through.
        return

    if collection is None:
        collection = sheet_qunar_zyx

    for limit in range(0, route_count, _PAGE_SIZE):
        # The offset-0 page was already downloaded to read the route count;
        # reuse it instead of requesting the identical URL a second time.
        page = first_page if limit == 0 else get_json(_list_url(dep, item, limit))
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': page,
        }
        collection.insert_one(result)


def connect_mongo():
    """Return the ``qunar_zyx`` collection of the local ``qunar`` database."""
    client = pymongo.MongoClient('127.0.0.1', 27017)
    book_qunar = client['qunar']
    return book_qunar['qunar_zyx']


def get_json(url):
    """GET ``url`` and return the decoded JSON body.

    Sleeps one second after every request to throttle the crawl.

    Raises:
        requests.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    strhtml = requests.get(url, timeout=10)
    time.sleep(1)
    return strhtml.json()


def _destinations_for(dep):
    """Return the unique destination queries recommended for ``dep``."""
    arrive_dict = get_json(_ARRIVE_URL.format(urllib.request.quote(dep)))
    # A dict gives constant-time duplicate checks and keeps first-seen order.
    # Assumes query['query'] is a string — confirm against the live response.
    found = {}
    for arr_item in arrive_dict['data']:
        for arr_item_1 in arr_item['subModules']:
            for query in arr_item_1['items']:
                found.setdefault(query['query'], None)
    return list(found)


def main():
    """Crawl every departure city and store all of its route pages."""
    collection = connect_mongo()
    dep_dict = get_json(_DEP_CITIES_URL)
    for dep_names in dep_dict['data'].values():
        for dep in dep_names:
            for item in _destinations_for(dep):
                get_list(dep, item, collection)


if __name__ == "__main__":
    main()

浙公网安备 33010602011771号