使用json.dump(citys_data, f, ensure_ascii=False)写文件的时候,如果要写入汉字,则要指定ensure_ascii为False
这个代码例子为获取链家网里所有的城市,然后按照{省名:{市名:url, 市名:url, ...}, ...}的结构写入json文件
import requests
from lxml import etree
import json
def get_all_city(timeout=10):
    """Fetch Lianjia's nationwide city index and group city URLs by province.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block forever.

    Returns:
        A dict shaped like ``{province_name: {city_name: city_url, ...}, ...}``,
        or ``None`` if the page could not be fetched (network failure,
        timeout, or an HTTP error status).
    """
    url = "https://www.lianjia.com/city/"  # nationwide city list
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of ConnectionError, Timeout and
        # HTTPError, so flaky-network cases all end up here.
        print(e)
        return None

    tree = etree.HTML(response.text)
    citys = {}
    for province in tree.xpath("//div[@class='city_province']"):
        titles = province.xpath(".//div[@class='city_list_tit c_b']/text()")
        if not titles:
            # A province block without a title cannot be keyed; skip it
            # rather than crash with IndexError.
            continue
        province_name = titles[0]  # province name, e.g. 山东
        city_names = province.xpath(".//ul/li/a/text()")  # e.g. ['菏泽', '济南', ...]
        city_urls = province.xpath(".//ul/li/a/@href")  # e.g. ['https://heze.lianjia.com/', ...]
        # Pair each city name with its URL.
        citys[province_name] = dict(zip(city_names, city_urls))
    return citys
if __name__ == '__main__':
    # Script entry point: fetch the province -> city -> URL mapping and save
    # it as JSON.
    citys_data = get_all_city()
    if citys_data is None:
        # get_all_city() returns None when the request fails. Stop before
        # opening the file so an earlier good citys_data.json is not
        # truncated and replaced with "null".
        raise SystemExit("Failed to fetch the city list; citys_data.json was not written.")
    with open("citys_data.json", "w", encoding='utf-8') as f:
        # ensure_ascii=False writes Chinese characters as-is instead of
        # \uXXXX escape sequences.
        json.dump(citys_data, f, ensure_ascii=False)
json.dump(citys_data, f),默认ensure_ascii为True,中文会被转义成\uXXXX形式(对应下面第一段输出) |
json.dump(citys_data, f, ensure_ascii=False),指定ensure_ascii为False,中文按原样写入(对应下面第二段输出) |
{
"\u5b89\u5fbd": { "\u5b89\u5e86": "https://aq.lianjia.com/", "\u6ec1\u5dde": "https://cz.fang.lianjia.com/", "\u961c\u9633": "https://fy.lianjia.com/", "\u5408\u80a5": "https://hf.lianjia.com/", "\u9a6c\u978d\u5c71": "https://mas.lianjia.com/", "\u829c\u6e56": "https://wuhu.lianjia.com/" }, "\u5317\u4eac": { "\u5317\u4eac": "https://bj.lianjia.com/" }, "\u91cd\u5e86": { "\u91cd\u5e86": "https://cq.lianjia.com/" }, 。。。。。。。
}
|
{
"安徽": { "安庆": "https://aq.lianjia.com/", "滁州": "https://cz.fang.lianjia.com/", "阜阳": "https://fy.lianjia.com/", "合肥": "https://hf.lianjia.com/", "马鞍山": "https://mas.lianjia.com/", "芜湖": "https://wuhu.lianjia.com/" }, "北京": { "北京": "https://bj.lianjia.com/" }, "重庆": { "重庆": "https://cq.lianjia.com/" }, 。。。。。。。
}
|
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了