4.5 代码优化
代码优化可以提高代码的可读性,这里使用def自定义函数对本书4.4节的代码进行优化。
import time
import urllib.request  # import the submodule explicitly; a bare `import urllib` does not load it

import pymongo
import requests

# The MongoDB handles live at module level, so they are importable from
# other scripts.
client = pymongo.MongoClient('localhost', 27017)
book_qunar = client['qunar']
sheet_qunar_111 = book_qunar['qunar_111']

# Request headers sent with every call made by get_json().
headers = {
    "cookie": "QN1=00001d80075816179a202bf8; QN300=organic; QN48=tc_f16fbad98113c317_16b788ad2eb_c6fe; _RF1=36.110.118.134; _RSG=s2q26YN7uhCLRxFTBFcimB; _RDG=28a5b8a8ad72ce2"
}


def get_json(url):
    """Request *url* with the shared headers and return the decoded JSON body.

    Sleeps for one second after each request before returning.
    """
    strhtml = requests.get(url, headers=headers)
    time.sleep(1)
    return strhtml.json()


def get_list(dep, item):
    """Fetch every result page for one route and store each page in MongoDB.

    Args:
        dep: Departure city name (URL-quoted before use).
        item: Destination / search keyword (URL-quoted before use).

    The first request reads ``data.limit.routeCount`` from the response; the
    listing is then requested in steps of 20 and one document per page is
    inserted into ``sheet_qunar_111``.
    """
    # One template for both requests. The placeholders are, in order:
    # dep, query, originalquery and the paging offset. The original literal
    # hard-coded ``originalquery`` and ``limit=0,20``, so the extra format()
    # arguments were ignored and every iteration re-fetched the first page.
    url_template = (
        'https://touch.dujia.qunar.com/list'
        '?modules=list%2CbookingInfo%2CactivityDetail'
        '&dep={}&query={}'
        '&dappDealTrace=true'
        '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
        '&originalquery={}&limit={},20'
        '&includeAD=true&qsact=search'
    )
    quoted_dep = urllib.request.quote(dep)
    quoted_item = urllib.request.quote(item)

    # First request: only used to learn how many routes exist in total.
    url = url_template.format(quoted_dep, quoted_item, quoted_item, 0)
    strhtml = get_json(url)
    routeCount = int(strhtml['data']['limit']['routeCount'])

    for limit in range(0, routeCount, 20):
        url = url_template.format(quoted_dep, quoted_item, quoted_item, limit)
        strhtml = get_json(url)
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': strhtml
        }
        sheet_qunar_111.insert_one(result)


if __name__ == "__main__":
    url = 'https://touch.dujia.qunar.com/depCities.qunar'
    dep_dict = get_json(url)
    for dep_item in dep_dict['data']:
        for dep in dep_dict['data'][dep_item]:
            a = []  # de-duplicated destinations for this departure city
            url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(
                urllib.request.quote(dep))
            arrive_dict = get_json(url)
            for arr_item in arrive_dict['data']:
                for arr_item_1 in arr_item['subModules']:
                    for query in arr_item_1['items']:
                        if query['query'] not in a:  # skip destinations already collected
                            a.append(query['query'])
            for item in a:
                get_list(dep, item)
下面将“获取网页返回的JSON结果”这一操作封装为一个自定义函数,传入参数为url(要访问的地址),代码如下。
def get_json(url):
    """Request *url* with the shared ``headers`` and return the decoded JSON body.

    Sleeps for one second after the request has been sent, before the
    response is decoded and returned.
    """
    response = requests.get(url, headers=headers)
    time.sleep(1)
    payload = response.json()
    return payload
下面将“获取产品列表信息”这一操作封装为一个自定义函数,传入参数为dep(出发地)和item(目的地),代码如下。
def get_list(dep, item):
    """Fetch every result page for one route and store each page in MongoDB.

    Args:
        dep: Departure city name (URL-quoted before use).
        item: Destination / search keyword (URL-quoted before use).

    The first request reads ``data.limit.routeCount`` from the response; the
    listing is then requested in steps of 20 and one document per page is
    inserted into ``sheet_qunar_111``.
    """
    # One template for both requests. The placeholders are, in order:
    # dep, query, originalquery and the paging offset. The original literal
    # hard-coded ``originalquery`` and ``limit=0,20``, so the extra format()
    # arguments were ignored and every iteration re-fetched the first page.
    url_template = (
        'https://touch.dujia.qunar.com/list'
        '?modules=list%2CbookingInfo%2CactivityDetail'
        '&dep={}&query={}'
        '&dappDealTrace=true'
        '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
        '&originalquery={}&limit={},20'
        '&includeAD=true&qsact=search'
    )
    quoted_dep = urllib.request.quote(dep)
    quoted_item = urllib.request.quote(item)

    # First request: only used to learn how many routes exist in total.
    url = url_template.format(quoted_dep, quoted_item, quoted_item, 0)
    strhtml = get_json(url)
    routeCount = int(strhtml['data']['limit']['routeCount'])

    for limit in range(0, routeCount, 20):
        url = url_template.format(quoted_dep, quoted_item, quoted_item, limit)
        strhtml = get_json(url)
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': strhtml
        }
        sheet_qunar_111.insert_one(result)


if __name__ == "__main__":
    url = 'https://touch.dujia.qunar.com/depCities.qunar'
    dep_dict = get_json(url)
    for dep_item in dep_dict['data']:
        for dep in dep_dict['data'][dep_item]:
            a = []  # de-duplicated destinations for this departure city
            url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175'.format(
                urllib.request.quote(dep))
            arrive_dict = get_json(url)
            for arr_item in arrive_dict['data']:
                for arr_item_1 in arr_item['subModules']:
                    for query in arr_item_1['items']:
                        if query['query'] not in a:  # skip destinations already collected
                            a.append(query['query'])
            for item in a:
                get_list(dep, item)
if __name__ == "__main__"表示:如果直接执行某个.py文件,那么该文件中__name__ == "__main__"的结果为True,将执行if __name__ == "__main__"下面定义的代码块;如果从另外一个.py文件通过import导入该文件,那么__name__的值就是该模块的名字(即去掉.py后缀的文件名)而不是__main__,将不会执行if __name__ == "__main__"下面定义的代码块。也就是说,用户写的脚本模块既可以被别的模块导入使用,也可以直接执行。
写好爬虫后,需要一个程序定时监控运行结果,具体步骤如下。
⑴新建一个Python File,命名为test2.py,如图所示。
⑵设置为每10秒监控一次数据库的记录数,在test2.py中输入以下代码。
import time

from best_code import sheet_qunar_111

# Print the number of documents in the collection every 10 seconds.
# Collection.count_documents({}) replaces Cursor.count(), which was deprecated
# in PyMongo 3.7 and removed in PyMongo 4.0.
while True:
    print(sheet_qunar_111.count_documents({}))
    time.sleep(10)
代码运行结果如图所示。
from best_code import sheet_qunar_111表示从best_code.py文件中导入名为sheet_qunar_111的对象。需要注意的是,import只能导入在模块顶层定义的对象,无法访问函数内部定义的局部对象,因此MongoDB的连接和集合对象必须在模块顶层定义,不能在函数或者子过程中定义。