Python 操作 MongoDB(一)
- 案例1
import pymongo

# Case 1: insert documents into the `student` collection of the `niit` database.
#
# 1. Client object. MongoClient is used as a context manager so the
#    connection is closed when the block exits, even if an insert raises.
with pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/') as conn:
    # 2. Database handle
    db = conn['niit']
    # 3. Collection handle
    myset = db['student']

    # 4. Insert a single document
    myset.insert_one({'name': '泰坦尼克号', 'star': 'T', 'time': '1990-01-01'})

    # 5. Insert several documents in one call: [{...}, {...}, ...]
    film_li = [
        {'name': '风之子', 'star': 'Tom', 'time': '1991-01-01'},
        {'name': '雄霸天下', 'star': 'Dong', 'time': '1992-01-01'},
    ]
    myset.insert_many(film_li)
-
查看
-
案例2
import pprint
import pymongo
import requests
import parsel
import csv
import time
import traceback
import sys
from lxml import etree
from utils import fake_useragent
import random


class LianJia:
    """Crawl second-hand housing listings from cq.lianjia.com.

    Listing pages are fetched one by one, every detail page linked from a
    listing is requested and parsed into a dict, and the dicts are stored in
    the `housedb` collection of the `niit` MongoDB database (optionally also
    in `lianjia.csv`).
    """

    # Seconds to wait for a single HTTP response; without a timeout
    # `requests.get` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Build the request headers and open the CSV and MongoDB outputs."""
        self.headers = {
            "User-Agent": fake_useragent.get_ua(),
            # Session cookie captured from a browser, split at "; " boundaries
            # purely for readability (adjacent literals are concatenated).
            "Cookie": (
                "rocketchatscreenshare=chrome; rocketchatscreenshare=chrome; "
                "SECKEY_ABVK=CGOJ6MuYOjFKNa33qPFsY00sfNkK2nX0/sCiQE02m0o%3D; "
                "BMAP_SECKEY=TklXxhQMsfiONmLL17H4aZzdZZZB9a7lag1vllR69Ts26TI4wWejhyyFgvk20iwTpTa4HsCqfKVuFsSioaB6-JSuGk5MZoWXJl2Q2aOMBZlsChsVdxKvxmCwbC1WI9BOACy7zn5cfsyDDCA_zxqvDt8GRGdSSGqoON1ISET02JUoSNq0VmbHLpp4uryBzrPi; "
                "lianjia_uuid=f5ec511d-5076-4fe8-9459-f69ba634db20; "
                "_smt_uid=65917081.4da845db; "
                "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218cc01f7d4111c-0b0cab615f36d1-26001951-836920-18cc01f7d42d60%22%2C%22%24device_id%22%3A%2218cc01f7d4111c-0b0cab615f36d1-26001951-836920-18cc01f7d42d60%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; "
                "_ga=GA1.2.662380443.1704030340; "
                "_ga_KJTRWRHDL1=GS1.2.1704030340.1.1.1704030548.0.0.0; "
                "_ga_QJN1VP0CMS=GS1.2.1704030340.1.1.1704030548.0.0.0; "
                "rocketchatscreenshare=chrome; _jzqc=1; _qzjc=1; "
                "_gid=GA1.2.1823928420.1704848679; "
                "Hm_lvt_9152f8221cb6243a53c83b956842be8a=1704030338,1704848703; "
                "_jzqx=1.1704852239.1704852239.1.jzqsr=cq%2Elianjia%2Ecom|jzqct=/ershoufang/pg2/.-; "
                "select_city=500000; "
                "lianjia_ssid=37dc69a9-31d7-4ec6-9b75-bbcfb45a1ff4; "
                "Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1704938861; "
                "_qzja=1.1249653452.1704030559946.1704852238582.1704938861736.1704855597566.1704938861736.0.0.0.15.5; "
                "_qzjto=1.1.0; "
                "_jzqa=1.132749057428865760.1704030338.1704852239.1704938862.5; "
                "_jzqckmp=1; "
                "srcid=eyJ0Ijoie1wiZGF0YVwiOlwiOTA2MTJlOTFmYzEyNjZmYjY2Y2UzYTQ5ZGU3NTFmZTg4MTcwMTYwODgzNzNhNDllNjhhMzcxNzdkZjNjNjEzOTIyNjVlZTEwMTIwZmYwOTJjMzAxZjg5YzkwNjIzYTU1NWFlYzhkNDhhYzM4OTBhY2RlYThhMzQzNzBiZjE0NDM4NTczZTVmMDE1ZTE5YmQ4OGNhYjI3MGQyNjdkMmRiMmM3ZjhjZWU5NWQ1ODBmOGExMDAzYzZiOGJjYjI4NzE3OTM0MDI4MTFlODI1MDJlM2FjY2M1ZjgyM2MwMWU3NGIzZTEyNDkxMGFmM2NlOWFiNjRkYzliYzc0ZDZhN2ZkY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJjYmUwOGFkYVwifSIsInIiOiJodHRwczovL2NxLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvcGc0LyIsIm9zIjoid2ViIiwidiI6IjAuMSJ9; "
                "_jzqb=1.1.10.1704938862.1; "
                "_qzjb=1.1704938861736.1.0.0.0; "
                "_gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; "
                "_ga_PV625F3L95=GS1.2.1704938863.5.0.1704938863.0.0.0"
            ),
            "Connection": "keep-alive",
            "Host": "cq.lianjia.com",
        }

        self.f = open("lianjia.csv", mode="a", encoding="utf-8", newline="")
        # `chao_xiang` is produced by parseHtml, so it needs a column;
        # otherwise DictWriter.writerow raises ValueError on the extra key.
        self.fieldnames = ["title", "areaName", "communityName", "hu_xing", "chao_xiang"]
        # extrasaction="ignore": insert_many adds an `_id` key to each dict in
        # place, which must not break a CSV write of the same rows.
        self.csv_writer = csv.DictWriter(
            self.f,
            fieldnames=self.fieldnames,
            delimiter='\t',
            extrasaction="ignore",
        )
        # The file is opened in append mode: only emit the header when the
        # file is still empty, so repeated runs do not duplicate it.
        if self.f.tell() == 0:
            self.csv_writer.writeheader()

        self.conn = pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/')
        self.db = self.conn['niit']
        self.myset = self.db['housedb']

    @staticmethod
    def _first(nodes):
        """Return the first XPath result, or None when nothing matched."""
        return nodes[0] if nodes else None

    @staticmethod
    def _first_nonblank(nodes):
        """Return the first non-whitespace text node (stripped), or None."""
        for node in nodes:
            text = str(node).strip()
            if text:
                return text
        return None

    def getHtml(self):
        """Fetch listing pages 16-19, parse each one and store the results."""
        for num in range(16, 20):
            # Random 1-3 s pause between listing pages
            time.sleep(random.randint(1, 3))
            url = f"https://cq.lianjia.com/ershoufang/pg{num}/"
            response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            print(url)
            html = etree.HTML(response.text)
            houselist = self.parseHtml(html)
            # To also write the rows to lianjia.csv, call
            # self.save_html(houselist) here.
            self.save_html2_mongo(houselist)

    def parseHtml(self, html):
        """Follow every detail link on a listing page and extract house data.

        Args:
            html: parsed lxml tree of a listing page.

        Returns:
            A list of dicts with the keys `title`, `areaName`,
            `communityName`, `hu_xing` and `chao_xiang`. A field that cannot
            be found on the page is stored as None. A detail page that fails
            to download or parse is reported on stdout and skipped.
        """
        href = html.xpath('//div[@class="title"]/a/@href')
        print(href)
        houselist = []
        for link in href:
            try:
                # Second request: the detail page of one listing. Kept inside
                # the try so one failing page does not abort the whole crawl.
                response2 = requests.get(url=link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                html2 = etree.HTML(response2.text)

                house = {}
                # Title
                house['title'] = self._first(
                    html2.xpath('//div[@class="sellDetailHeader"]//h1/text()'))
                # District
                house['areaName'] = self._first(
                    html2.xpath('//div[@class="areaName"]/span[2]/a[1]/text()'))
                # Community name
                house['communityName'] = self._first(
                    html2.xpath('//div[@class="communityName"]/a[1]/text()'))
                # Layout: the value is taken from the second text node
                hu_xing = html2.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[1]/text()')
                house['hu_xing'] = str(hu_xing[1]).strip() if len(hu_xing) > 1 else None
                # Orientation: skip whitespace-only text nodes, otherwise the
                # stored value is just "\n".
                house['chao_xiang'] = self._first_nonblank(
                    html2.xpath('//div[@class="base"]/div[@class="content"]/ul/li[7]/text()'))

                print(house)
                houselist.append(house)
            except Exception:
                # Best-effort crawl: report where the failure happened and
                # continue with the next listing.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print(f"异常发生在第 {exc_traceback.tb_lineno} 行")
                traceback.print_exception(exc_type, exc_value, exc_traceback)
        return houselist

    def save_html2_mongo(self, dicts):
        """Insert the parsed houses into MongoDB; do nothing for an empty list."""
        # insert_many rejects an empty document list, which happens when
        # every detail page of a listing failed to parse.
        if not dicts:
            return
        self.myset.insert_many(dicts)

    def save_html(self, houselist):
        """Append the parsed houses to the CSV file, echoing each row."""
        if not houselist:
            return
        print(len(houselist))
        for row in houselist:
            print(row)
            self.csv_writer.writerow(row)

    def close(self):
        """Release the CSV file handle and the MongoDB connection."""
        self.f.close()
        self.conn.close()

    def run(self):
        """Entry point: crawl all configured listing pages."""
        self.getHtml()


if __name__ == '__main__':
    spider = LianJia()
    try:
        spider.run()
    finally:
        # Flush the CSV and drop the database connection even on failure.
        spider.close()
-
查看
-
案例3
import pandas as pd
import pymongo

# Case 3: load the scraped documents from MongoDB into a pandas DataFrame.

# Connect to MongoDB (replace the URI with your own server address).
client = pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/')
try:
    # Select the database and collection (replace with your own names).
    db = client['niit']
    collection = db['housedb']

    # Materialise the query result and turn it into a DataFrame.
    data = pd.DataFrame(list(collection.find()))
finally:
    # Close the connection even if the query fails.
    client.close()

print(data.columns)
print(data['title'])
print(data.head())
print(data.describe())
# DataFrame.info() prints its summary itself and returns None; printing the
# attribute without calling it only shows the bound-method repr.
data.info()
- 控制台打印
Index(['_id', 'title', 'areaName', 'communityName', 'hu_xing', 'chao_xiang'], dtype='object') 0 弹子石轻轨站旁精装两房 户型方正 采光好 1 石桥铺秋玉景苑 正规2室2厅出售看房方便 2 轻轨房,住家装修,小区环境优美,配套齐全 3 小区环境优美,价格亲民,配套设施完善 4 万象城商圈 精装2房 中高楼层 视野好 ... 114 精装两房 急售 欢迎看房 价格可谈 115 户型方正采光好居家方便家装房居家两室 116 东原物业,楼层好,交通方便,住家安静,诚心卖 117 南坪商圈 四公里双轻轨 蓝山日记 住家两房 近永辉 118 交通方便,配套齐全。装修保持好,大两房急售。 Name: title, Length: 119, dtype: object _id ... chao_xiang 0 659f501bc1821d1d9101599d ... \n 1 659f501bc1821d1d9101599e ... \n 2 659f501bc1821d1d9101599f ... \n 3 659f501bc1821d1d910159a0 ... \n 4 659f501bc1821d1d910159a1 ... \n [5 rows x 6 columns] _id ... chao_xiang count 119 ... 119 unique 119 ... 1 top 659f501bc1821d1d9101599d ... \n freq 1 ... 119 [4 rows x 6 columns] <bound method DataFrame.info of _id ... chao_xiang 0 659f501bc1821d1d9101599d ... \n 1 659f501bc1821d1d9101599e ... \n 2 659f501bc1821d1d9101599f ... \n 3 659f501bc1821d1d910159a0 ... \n 4 659f501bc1821d1d910159a1 ... \n .. ... ... ... 114 659f5083c1821d1d91015a0f ... \n 115 659f5083c1821d1d91015a10 ... \n 116 659f5083c1821d1d91015a11 ... \n 117 659f5083c1821d1d91015a12 ... \n 118 659f5083c1821d1d91015a13 ... \n [119 rows x 6 columns]>
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义