爬虫脚本测试:使用 Python requests 库获取百度地图上的信息

实现功能

对百度地图上指定的多个城市进行搜索,搜索所用的多个关键字也在代码中给出,统计搜索到的信息并输出为 csv 文件。


实现过程

首先在主函数中用两个列表分别存储城市编号和关键字,然后建立 csv 文件,遍历每个城市、每个关键字的前十页搜索结果,逐页调用爬虫函数获取信息。每次调用结束后让线程休眠一段时间,以免影响百度服务器的稳定。

函数中先把请求参数设置好并给出链接,然后用 requests.get() 方法获取 html 文档,再用正则表达式把其中需要的内容找出来,写入 csv 即可。


代码

import requests
import re
import csv
import time

def _iter_mobile_records(text):
    """Yield ``(name, address, numbers)`` for every usable record in *text*.

    A record is the text between an ``address_norm":"[`` marker and the next
    ``"ty":`` key.  Records that lack a name, an address prefix or a phone
    field are skipped.  ``numbers`` holds only the comma-separated phone
    entries that start with ``'1'``.
    """
    for record in re.findall(r'(?<=\baddress_norm":"\[).+?(?="ty":)', text):
        names = re.findall(r'(?<=\b"\},"name":").+?(?=")', record)
        # Everything up to the first double quote is treated as the raw address.
        leading = re.findall(r'.+?(?=")', record)
        phones = re.findall(r'(?<="phone":").+?(?=")', record)

        # Guard clauses replace the old blanket ``except``: a missing field
        # simply means this record cannot produce a row.
        if not (names and leading and phones):
            continue
        # An empty phone value ("phone":"",) is captured by the pattern as '",'.
        if phones[0] == '",':
            continue

        # Strip the "(...[" and "(...]" fragments embedded in the address.
        address = re.sub(r'\(.+?\[', ' ', leading[0])
        address = re.sub(r'\(.+?\]', ' ', address)

        numbers = [n for n in phones[0].split(',') if n.startswith('1')]
        yield names[0], address, numbers


def BusinessFromBaiduDitu(citycode, key_word, pageno):
    """Fetch one result page from Baidu Maps and record the mobile numbers.

    The page is requested from ``http://map.baidu.com/`` with a fixed set of
    query parameters.  Every phone entry starting with ``'1'`` is printed and
    written as a ``(name, address, number)`` row through the module-level
    ``writer`` object, which the caller must create before calling this
    function.

    Args:
        citycode: City code sent as the ``c`` parameter.  Must be a ``str``
            because it is concatenated into the printed line.
        key_word: Search keyword sent as the ``wd`` parameter.
        pageno: Zero-based page index; sent as ``pn`` and, multiplied by 10,
            as ``nn``.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            does not complete within the timeout.
    """
    parameter = {
            "newmap": "1",
            "reqflag": "pcmap",
            "biz": "1",
            "from": "webmap",
            "da_par": "direct",
            "pcevaname": "pc4.1",
            "qt": "con",
            "c": citycode,
            "wd": key_word,
            "wd2": "",
            "pn": pageno,
            "nn": pageno * 10,
            "db": "0",
            "sug": "0",
            "addr": "0",
            "da_src": "pcmappg.poi.page",
            "on_gel": "1",
            "src": "7",
            "gr": "3",
            "l": "12",
            "tn": "B_NORMAL_MAP",
            "ie": "utf-8",
            "t": "1468896652886"
            }

    headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
            }

    url = 'http://map.baidu.com/'
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, params=parameter, headers=headers, timeout=10)
    # Round-trip through latin-1 bytes so backslash escapes in the body are
    # expanded (presumably \uXXXX-escaped text -- confirm against a live response).
    text = response.text.encode('latin-1').decode('unicode_escape')

    for name, address, numbers in _iter_mobile_records(text):
        try:
            for number in numbers:
                print(citycode + name + ',' + address + ',' + number)
                writer.writerow((name, address, number))
        except UnicodeEncodeError:
            # Best effort: a record the console or the CSV encoding cannot
            # represent is skipped instead of aborting the crawl.
            continue


if __name__ == "__main__":
    # City codes and search keywords to crawl.
    citynumlist = ['257']
    keywordlist = ['烟酒超市']

    # Open the output CSV in append mode.  The ``with`` block guarantees the
    # file is flushed and closed even if a request fails part-way through.
    with open(r'F:/VSProjects/ExtractInfoFromBaiduMap/%s.csv' % 'CityData', 'a+', newline = '', encoding = 'ANSI') as csvFile:
        # ``writer`` must stay a module-level name: BusinessFromBaiduDitu
        # writes its rows through it.
        writer = csv.writer(csvFile)

        # Write the header only when the file is empty, so repeated runs do
        # not append a duplicate header row in the middle of the data.
        csvFile.seek(0, 2)
        if csvFile.tell() == 0:
            writer.writerow(('姓名', '地址', '电话'))

        # Timer and request counter.
        start = time.time()
        num = 1

        # Crawl the first ten result pages for every city/keyword pair.
        for citycode in citynumlist:
            for kw in keywordlist:
                for page in range(10):
                    BusinessFromBaiduDitu(citycode, kw, page)

                    # Throttling: pause after every request, with extra
                    # pauses after every 20th, 100th and 200th request.
                    time.sleep(1)
                    if num % 20 == 0:
                        time.sleep(2)
                    if num % 100 == 0:
                        time.sleep(3)
                    if num % 200 == 0:
                        time.sleep(7)
                    num += 1

    # Report the elapsed time in whole seconds.
    end = time.time()
    lasttime = int(end - start)
    print('耗时' + str(lasttime) + 's')

posted @ 2021-01-14 15:27  老鼠司令  阅读(679)  评论(0)  编辑  收藏  举报