Python 操作 MongoDB(一)
- 案例1
import pymongo
# Case 1: insert documents into the `student` collection of the `niit` database.
# NOTE: the connection URI (including credentials) is hard-coded for the
# tutorial; load it from configuration or the environment in real code.

# 1. Client object. Using the client as a context manager closes its
#    connections when the block exits, even if an insert raises.
with pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/') as conn:
    # 2. Database object
    db = conn['niit']
    # 3. Collection object
    myset = db['student']
    # 4. Insert a single document
    myset.insert_one({'name': '泰坦尼克号', 'star': 'T', 'time': '1990-01-01'})
    # 5. Insert several documents at once: a list of dicts [{}, {}, {}]
    film_li = [
        {'name': '风之子', 'star': 'Tom', 'time': '1991-01-01'},
        {'name': '雄霸天下', 'star': 'Dong', 'time': '1992-01-01'},
    ]
    myset.insert_many(film_li)
- 查看
- 案例2
import pprint
import pymongo
import requests
import parsel
import csv
import time
import traceback
import sys
from lxml import etree
from utils import fake_useragent
import random
class LianJia:
    """Scrape second-hand housing listings from cq.lianjia.com.

    Listing pages are fetched one by one; every detail link found on a page
    is requested a second time and a few fields are extracted from it. The
    resulting dicts are written to the MongoDB collection ``niit.housedb``
    and can optionally be appended to ``lianjia.csv`` (tab separated).

    Call :meth:`close` when finished to release the CSV file handle and the
    MongoDB client.
    """

    # CSV columns. Every key that parseHtml() stores in a house dict must be
    # listed here, otherwise csv.DictWriter.writerow() raises ValueError.
    FIELDNAMES = ["title", "areaName", "communityName", "hu_xing", "chao_xiang"]

    # Seconds to wait for an HTTP response; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Build the request headers, open the CSV writer and connect to MongoDB."""
        # NOTE(review): the session cookie and the MongoDB credentials below
        # are hard-coded; move them to configuration before sharing this code.
        self.headers = {
            "User-Agent": fake_useragent.get_ua(),
            # One logical cookie string, split into two adjacent literals only
            # to keep it inside a valid (single-line) string literal.
            "Cookie": (
                "rocketchatscreenshare=chrome; rocketchatscreenshare=chrome; SECKEY_ABVK=CGOJ6MuYOjFKNa33qPFsY00sfNkK2nX0/sCiQE02m0o%3D; BMAP_SECKEY=TklXxhQMsfiONmLL17H4aZzdZZZB9a7lag1vllR69Ts26TI4wWejhyyFgvk20iwTpTa4HsCqfKVuFsSioaB6-JSuGk5MZoWXJl2Q2aOMBZlsChsVdxKvxmCwbC1WI9BOACy7zn5cfsyDDCA_zxqvDt8GRGdSSGqoON1ISET02JUoSNq0VmbHLpp4uryBzrPi; lianjia_uuid=f5ec511d-5076-4fe8-9459-f69ba634db20; _smt_uid=65917081.4da845db; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218cc01f7d4111c-0b0cab615f36d1-26001951-836920-18cc01f7d42d60%22%2C%22%24device_id%22%3A%2218cc01f7d4111c-0b0cab615f36d1-26001951-836920-18cc01f7d42d60%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _ga=GA1.2.662380443.1704030340; _ga_KJTRWRHDL1=GS1.2.1704030340.1.1.1704030548.0.0.0; _ga_QJN1VP0CMS=GS1.2.1704030340.1.1.1704030548.0.0.0; rocketchatscreenshare=chrome; _jzqc=1; _qzjc=1; _gid=GA1.2.1823928420.1704848679; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1704030338,1704848703; _jzqx=1.1704852239.1704852239.1.jzqsr=cq%2Elianjia%2Ecom|jzqct=/ershoufang/pg2/.-; select_city=500000; lianjia_ssid=37dc69a9-31d7-4ec6-9b75-bbcfb45a1ff4; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1704938861; _qzja=1.1249653452.1704030559946.1704852238582.1704938861736.1704855597566.1704938861736.0.0.0.15.5; _qzjto=1.1.0; _jzqa=1.132749057428865760.1704030338.1704852239.1704938862.5; _jzqckmp=1; "
                "srcid=eyJ0Ijoie1wiZGF0YVwiOlwiOTA2MTJlOTFmYzEyNjZmYjY2Y2UzYTQ5ZGU3NTFmZTg4MTcwMTYwODgzNzNhNDllNjhhMzcxNzdkZjNjNjEzOTIyNjVlZTEwMTIwZmYwOTJjMzAxZjg5YzkwNjIzYTU1NWFlYzhkNDhhYzM4OTBhY2RlYThhMzQzNzBiZjE0NDM4NTczZTVmMDE1ZTE5YmQ4OGNhYjI3MGQyNjdkMmRiMmM3ZjhjZWU5NWQ1ODBmOGExMDAzYzZiOGJjYjI4NzE3OTM0MDI4MTFlODI1MDJlM2FjY2M1ZjgyM2MwMWU3NGIzZTEyNDkxMGFmM2NlOWFiNjRkYzliYzc0ZDZhN2ZkY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJjYmUwOGFkYVwifSIsInIiOiJodHRwczovL2NxLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvcGc0LyIsIm9zIjoid2ViIiwidiI6IjAuMSJ9; _jzqb=1.1.10.1704938862.1; _qzjb=1.1704938861736.1.0.0.0; _gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga_PV625F3L95=GS1.2.1704938863.5.0.1704938863.0.0.0"
            ),
            "Connection": "keep-alive",
            "Host": "cq.lianjia.com",
        }
        self.f = open("lianjia.csv", mode="a", encoding="utf-8", newline="")
        self.fieldnames = list(self.FIELDNAMES)
        self.csv_writer = csv.DictWriter(self.f, fieldnames=self.fieldnames, delimiter='\t')
        # The file is opened in append mode: only emit the header when the
        # file is still empty, so repeated runs do not scatter header rows
        # through the data.
        if self.f.tell() == 0:
            self.csv_writer.writeheader()
        self.conn = pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/')
        self.db = self.conn['niit']
        self.myset = self.db['housedb']

    @staticmethod
    def _first_text(nodes):
        """Return the first non-blank text node, stripped, or None if there is none."""
        for node in nodes:
            text = str(node).strip()
            if text:
                return text
        return None

    def getHtml(self, start_page=16, stop_page=20):
        """Fetch listing pages and store the houses found on each of them.

        Args:
            start_page: first listing page number to fetch (inclusive).
            stop_page: page number to stop before (exclusive).
        """
        for num in range(start_page, stop_page):
            # Random 1-3 s pause between listing pages to avoid hammering the site.
            time.sleep(random.randint(1, 3))
            url = f"https://cq.lianjia.com/ershoufang/pg{num}/"
            response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            print(url)
            html = etree.HTML(response.text)
            houselist = self.parseHtml(html)
            # Only MongoDB is written here; self.save_html(houselist) would
            # additionally append the rows to the CSV file.
            self.save_html2_mongo(houselist)

    def parseHtml(self, html):
        """Follow every detail link on a listing page and extract house fields.

        Args:
            html: parsed listing page (the result of ``etree.HTML``); ``None``
                is treated as a page without listings.

        Returns:
            A list of dicts with the keys ``title``, ``areaName``,
            ``communityName``, ``hu_xing`` and ``chao_xiang``. A field that is
            missing on the page is stored as ``None``. Detail pages that fail
            to download or parse are reported on stdout and skipped.
        """
        if html is None:
            return []
        href = html.xpath('//div[@class="title"]/a/@href')
        print(href)
        houselist = []
        for link in href:
            house = {}
            try:
                # Second request: the detail page of one listing. It lives
                # inside the try so a single failing page does not abort the
                # whole crawl.
                response2 = requests.get(url=link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                html2 = etree.HTML(response2.text)

                title = html2.xpath('//div[@class="sellDetailHeader"]//h1/text()')  # title
                house['title'] = title[0] if title else None
                areaName = html2.xpath('//div[@class="areaName"]/span[2]/a[1]/text()')  # district
                house['areaName'] = areaName[0] if areaName else None
                communityName = html2.xpath('//div[@class="communityName"]/a[1]/text()')  # community name
                house['communityName'] = communityName[0] if communityName else None
                hu_xing = html2.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[1]/text()')  # layout
                # The value is taken from the second text node; guard on the
                # length (not mere truthiness) so a single-node result cannot
                # raise IndexError.
                if len(hu_xing) > 1:
                    house['hu_xing'] = str(hu_xing[1]).strip()
                else:
                    house['hu_xing'] = self._first_text(hu_xing)
                chao_xiang = html2.xpath('//div[@class="base"]/div[@class="content"]/ul/li[7]/text()')  # orientation
                # The first text node of this <li> was observed to be a bare
                # "\n"; take the first node that carries real text instead.
                house['chao_xiang'] = self._first_text(chao_xiang)
                print(house)
                houselist.append(house)
            except Exception:
                # Best effort: report where the failure happened and move on
                # to the next listing.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print(f"异常发生在第 {exc_traceback.tb_lineno} 行")
                traceback.print_exception(exc_type, exc_value, exc_traceback)
        return houselist

    def save_html2_mongo(self, dicts):
        """Insert the house dicts into MongoDB; an empty list is a no-op."""
        # insert_many() rejects an empty document list, so skip it explicitly.
        if not dicts:
            return
        self.myset.insert_many(dicts)

    def save_html(self, houselist):
        """Append the house dicts to the CSV file; an empty list is a no-op."""
        if not houselist:
            return
        print(len(houselist))
        for row in houselist:
            print(row)
            self.csv_writer.writerow(row)

    def close(self):
        """Close the CSV file handle and the MongoDB client."""
        self.f.close()
        self.conn.close()

    def run(self):
        """Entry point: crawl the default page range."""
        self.getHtml()


if __name__ == '__main__':
    spider = LianJia()
    try:
        spider.run()
    finally:
        # Release the file handle and the MongoDB client even if the crawl fails.
        spider.close()
- 查看
- 案例3
import pandas as pd
import pymongo
# Case 3: load the scraped `housedb` collection into a pandas DataFrame.
# Replace the URI, database name and collection name with your own.
# NOTE: the credentials are hard-coded for the tutorial only.

# The context manager closes the MongoDB client as soon as the data has been
# copied into memory, even if the query raises.
with pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/') as client:
    db = client['niit']
    collection = db['housedb']
    # Materialise the cursor and convert the documents to a DataFrame.
    data = pd.DataFrame(list(collection.find()))

print(data.columns)
print(data['title'])
print(data.head())
# Summary statistics of the DataFrame
print(data.describe())
# DataFrame.info is a method: it prints its own per-column summary and returns
# None, so call it directly. Printing `data.info` without the parentheses only
# shows the bound-method repr (`<bound method DataFrame.info of ...>`).
data.info()
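- 控制台打印(说明:最后一段 `<bound method DataFrame.info of …>` 是因为 `print(data.info)` 只打印了方法对象本身、并没有调用它;调用 `data.info()` 才会输出各列的类型与非空计数。另外 chao_xiang 列全部为 `\n`,说明案例2中取到的是空白文本节点,而不是真正的朝向值)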
Index(['_id', 'title', 'areaName', 'communityName', 'hu_xing', 'chao_xiang'], dtype='object')
0 弹子石轻轨站旁精装两房 户型方正 采光好
1 石桥铺秋玉景苑 正规2室2厅出售看房方便
2 轻轨房,住家装修,小区环境优美,配套齐全
3 小区环境优美,价格亲民,配套设施完善
4 万象城商圈 精装2房 中高楼层 视野好
...
114 精装两房 急售 欢迎看房 价格可谈
115 户型方正采光好居家方便家装房居家两室
116 东原物业,楼层好,交通方便,住家安静,诚心卖
117 南坪商圈 四公里双轻轨 蓝山日记 住家两房 近永辉
118 交通方便,配套齐全。装修保持好,大两房急售。
Name: title, Length: 119, dtype: object
_id ... chao_xiang
0 659f501bc1821d1d9101599d ... \n
1 659f501bc1821d1d9101599e ... \n
2 659f501bc1821d1d9101599f ... \n
3 659f501bc1821d1d910159a0 ... \n
4 659f501bc1821d1d910159a1 ... \n
[5 rows x 6 columns]
_id ... chao_xiang
count 119 ... 119
unique 119 ... 1
top 659f501bc1821d1d9101599d ... \n
freq 1 ... 119
[4 rows x 6 columns]
<bound method DataFrame.info of _id ... chao_xiang
0 659f501bc1821d1d9101599d ... \n
1 659f501bc1821d1d9101599e ... \n
2 659f501bc1821d1d9101599f ... \n
3 659f501bc1821d1d910159a0 ... \n
4 659f501bc1821d1d910159a1 ... \n
.. ... ... ...
114 659f5083c1821d1d91015a0f ... \n
115 659f5083c1821d1d91015a10 ... \n
116 659f5083c1821d1d91015a11 ... \n
117 659f5083c1821d1d91015a12 ... \n
118 659f5083c1821d1d91015a13 ... \n
[119 rows x 6 columns]>