爬虫脚本测试:使用 Python requests 库获取百度地图上的信息
实现功能
对百度地图上指定的多个城市进行搜索,搜索用的多个关键字也在代码中给出,统计搜索到的信息并输出为 csv 文件。
实现过程
首先在主函数中用两个列表分别存储城市编号和关键字,然后建立 csv 文件,遍历所有城市的所有关键字的前十页,调用爬虫函数获取信息。函数运行结束后设置线程的休眠,免得影响百度服务器稳定。
函数中先把请求参数都设置好,给出链接,然后用 requests.get() 方法获取 html 文档,再用正则表达式把其中需要的内容找出来,保存后写入 csv 即可。
代码
import requests
import re
import csv
import time
# Patterns used to pick business records out of the response text.
# One record: everything between 'address_norm":"[' and the next '"ty":'.
_RECORD_RE = re.compile(r'(?<=\baddress_norm":"\[).+?(?="ty":)')
# Business name inside a record.
_NAME_RE = re.compile(r'(?<=\b"\},"name":").+?(?=")')
# Leading text of a record up to its first double quote (the raw address).
_RAW_ADDRESS_RE = re.compile(r'.+?(?=")')
# Contents of the "phone" field inside a record.
_PHONE_RE = re.compile(r'(?<="phone":").+?(?=")')
# Parenthesised annotations in the raw address: "(...[" and "(...]".
_ADDRESS_TAG_OPEN_RE = re.compile(r'\(.+?\[')
_ADDRESS_TAG_CLOSE_RE = re.compile(r'\(.+?\]')


def _parse_businesses(page_text):
    """Return ``(name, address, numbers)`` tuples extracted from *page_text*.

    Only records that have a name, an address and at least one phone number
    starting with ``'1'`` are returned; ``numbers`` holds just those numbers.
    Records missing any of these pieces are skipped.
    """
    businesses = []
    for record in _RECORD_RE.findall(page_text):
        raw_address = _RAW_ADDRESS_RE.search(record)
        phone_field = _PHONE_RE.search(record)
        if raw_address is None or phone_field is None:
            continue

        phones = phone_field.group()
        # An empty phone field ("phone":"",) makes the pattern capture the
        # two characters '",' instead of a number, so treat that as "none".
        if phones == '",':
            continue

        # Keep only numbers beginning with '1'
        # (presumably mobile numbers -- TODO confirm the intent).
        numbers = [n for n in phones.split(',') if n.startswith('1')]
        name = _NAME_RE.search(record)
        if not numbers or name is None:
            continue

        # Replace the "(...[" / "(...]" annotations with single spaces.
        address = _ADDRESS_TAG_OPEN_RE.sub(' ', raw_address.group())
        address = _ADDRESS_TAG_CLOSE_RE.sub(' ', address)
        businesses.append((name.group(), address, numbers))
    return businesses


def BusinessFromBaiduDitu(citycode, key_word, pageno, timeout=10):
    """Fetch one Baidu Map result page and write matching rows to the CSV.

    Sends a GET request to ``http://map.baidu.com/`` for ``key_word`` in the
    city ``citycode``, extracts name / address / phone from the response with
    regular expressions, then prints and writes one row per phone number that
    starts with ``'1'``.

    Rows are written through the module-level ``writer`` (a ``csv.writer``
    created by the script entry point); it must exist before this is called.

    Args:
        citycode: City code sent as the ``c`` query parameter.
        key_word: Search keyword sent as the ``wd`` query parameter.
        pageno: Zero-based page index; sent as ``pn`` and, multiplied by 10,
            as ``nn``.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the crawl forever.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    parameter = {
        "newmap": "1",
        "reqflag": "pcmap",
        "biz": "1",
        "from": "webmap",
        "da_par": "direct",
        "pcevaname": "pc4.1",
        "qt": "con",
        "c": citycode,
        "wd": key_word,
        "wd2": "",
        "pn": pageno,
        "nn": pageno * 10,
        "db": "0",
        "sug": "0",
        "addr": "0",
        "da_src": "pcmappg.poi.page",
        "on_gel": "1",
        "src": "7",
        "gr": "3",
        "l": "12",
        "tn": "B_NORMAL_MAP",
        "ie": "utf-8",
        "t": "1468896652886",
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    url = 'http://map.baidu.com/'

    response = requests.get(url, params=parameter, headers=headers,
                            timeout=timeout)
    # Turn the literal \uXXXX escape sequences in the body into characters.
    page_text = response.text.encode('latin-1').decode('unicode_escape')

    for name, address, numbers in _parse_businesses(page_text):
        try:
            for number in numbers:
                print(f'{citycode}{name},{address},{number}')
                writer.writerow((name, address, number))
        except (UnicodeError, csv.Error):
            # Best effort: a record that cannot be encoded for the console or
            # the CSV file is skipped instead of aborting the whole page.
            continue
if __name__ == "__main__":
    # City codes and search keywords to crawl.
    citynumlist = ['257']
    keywordlist = ['烟酒超市']

    # Open the CSV in append mode and write the header row.
    # The `with` block guarantees the file is flushed and closed even if the
    # crawl is interrupted or raises.
    # NOTE(review): because the file is opened for appending, the header row
    # is written again on every run.
    # NOTE(review): the 'ANSI' codec name is believed to be available only on
    # Windows builds of Python -- confirm before running elsewhere.
    with open(r'F:/VSProjects/ExtractInfoFromBaiduMap/%s.csv' % 'CityData',
              'a+', newline='', encoding='ANSI') as csvFile:
        # `writer` must stay a module-level name: BusinessFromBaiduDitu reads
        # it as a global.
        writer = csv.writer(csvFile)
        writer.writerow(('姓名', '地址', '电话'))

        # Timer and request counter.
        start = time.time()
        num = 1

        # Fetch the first ten result pages for every city/keyword pair.
        for citycode in citynumlist:
            for kw in keywordlist:
                for page in range(10):
                    BusinessFromBaiduDitu(citycode, kw, page)

                    # Throttling: pause 1 s after every request, plus extra
                    # pauses on every 20th, 100th and 200th request.
                    time.sleep(1)
                    for every, pause in ((20, 2), (100, 3), (200, 7)):
                        if num % every == 0:
                            time.sleep(pause)
                    num += 1

    # Report the total elapsed time in whole seconds.
    end = time.time()
    lasttime = int(end - start)
    print(f'耗时{lasttime}s')