展开
拓展 关闭
订阅号推广码
GitHub
视频
公告栏 关闭

Python 操作 MongoDB(一)

  • 案例1
import pymongo
# 1. Connect to the MongoDB server. Using the client as a context manager
#    closes the connection when the block exits, even if an insert fails.
# NOTE(review): credentials and host are hard-coded in the URI; replace them
# with your own deployment's values.
with pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/') as conn:
    # 2. Database handle
    db = conn['niit']
    # 3. Collection handle
    myset = db['student']
    # 4. Insert a single document
    myset.insert_one({'name':'泰坦尼克号','star':'T','time':'1990-01-01'})

    # 5. Insert several documents in one call: a list of dicts [{}, {}, {}]
    film_li = [
        {'name':'风之子','star':'Tom','time':'1991-01-01'},
        {'name':'雄霸天下','star':'Dong','time':'1992-01-01'}
    ]
    myset.insert_many(film_li)
  • 查看

  • 案例2

import pprint
import  pymongo
import requests
import parsel
import csv
import time
import traceback
import  sys
from lxml import etree
from utils import  fake_useragent
import random
class LianJia:
    """Crawl second-hand housing listings from cq.lianjia.com.

    For each listing page the spider follows every detail link, extracts the
    title, area, community name, layout (``hu_xing``) and orientation
    (``chao_xiang``), and stores the resulting dicts in the MongoDB
    collection ``niit.housedb``. ``save_html`` can alternatively append them
    to ``lianjia.csv`` (tab-delimited).
    """

    # Seconds to wait for a response before giving up; without a timeout a
    # stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up request headers, the CSV writer and the MongoDB collection."""
        self.headers = {
            "User-Agent": fake_useragent.get_ua(),
            # The cookie is assembled from adjacent literals so that it stays
            # a single-line header value (a header cannot contain a newline).
            # NOTE(review): presumably captured from a browser session and
            # likely to expire -- confirm before relying on it.
            "Cookie": (
                "rocketchatscreenshare=chrome; rocketchatscreenshare=chrome; "
                "SECKEY_ABVK=CGOJ6MuYOjFKNa33qPFsY00sfNkK2nX0/sCiQE02m0o%3D; "
                "BMAP_SECKEY=TklXxhQMsfiONmLL17H4aZzdZZZB9a7lag1vllR69Ts26TI4wWejhyyFgvk20iwTpTa4HsCqfKVuFsSioaB6-JSuGk5MZoWXJl2Q2aOMBZlsChsVdxKvxmCwbC1WI9BOACy7zn5cfsyDDCA_zxqvDt8GRGdSSGqoON1ISET02JUoSNq0VmbHLpp4uryBzrPi; "
                "lianjia_uuid=f5ec511d-5076-4fe8-9459-f69ba634db20; "
                "_smt_uid=65917081.4da845db; "
                "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218cc01f7d4111c-0b0cab615f36d1-26001951-836920-18cc01f7d42d60%22%2C%22%24device_id%22%3A%2218cc01f7d4111c-0b0cab615f36d1-26001951-836920-18cc01f7d42d60%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; "
                "_ga=GA1.2.662380443.1704030340; "
                "_ga_KJTRWRHDL1=GS1.2.1704030340.1.1.1704030548.0.0.0; "
                "_ga_QJN1VP0CMS=GS1.2.1704030340.1.1.1704030548.0.0.0; "
                "rocketchatscreenshare=chrome; _jzqc=1; _qzjc=1; "
                "_gid=GA1.2.1823928420.1704848679; "
                "Hm_lvt_9152f8221cb6243a53c83b956842be8a=1704030338,1704848703; "
                "_jzqx=1.1704852239.1704852239.1.jzqsr=cq%2Elianjia%2Ecom|jzqct=/ershoufang/pg2/.-; "
                "select_city=500000; "
                "lianjia_ssid=37dc69a9-31d7-4ec6-9b75-bbcfb45a1ff4; "
                "Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1704938861; "
                "_qzja=1.1249653452.1704030559946.1704852238582.1704938861736.1704855597566.1704938861736.0.0.0.15.5; "
                "_qzjto=1.1.0; "
                "_jzqa=1.132749057428865760.1704030338.1704852239.1704938862.5; "
                "_jzqckmp=1; "
                "srcid=eyJ0Ijoie1wiZGF0YVwiOlwiOTA2MTJlOTFmYzEyNjZmYjY2Y2UzYTQ5ZGU3NTFmZTg4MTcwMTYwODgzNzNhNDllNjhhMzcxNzdkZjNjNjEzOTIyNjVlZTEwMTIwZmYwOTJjMzAxZjg5YzkwNjIzYTU1NWFlYzhkNDhhYzM4OTBhY2RlYThhMzQzNzBiZjE0NDM4NTczZTVmMDE1ZTE5YmQ4OGNhYjI3MGQyNjdkMmRiMmM3ZjhjZWU5NWQ1ODBmOGExMDAzYzZiOGJjYjI4NzE3OTM0MDI4MTFlODI1MDJlM2FjY2M1ZjgyM2MwMWU3NGIzZTEyNDkxMGFmM2NlOWFiNjRkYzliYzc0ZDZhN2ZkY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJjYmUwOGFkYVwifSIsInIiOiJodHRwczovL2NxLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvcGc0LyIsIm9zIjoid2ViIiwidiI6IjAuMSJ9; "
                "_jzqb=1.1.10.1704938862.1; "
                "_qzjb=1.1704938861736.1.0.0.0; "
                "_gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; "
                "_ga_PV625F3L95=GS1.2.1704938863.5.0.1704938863.0.0.0"
            ),
            "Connection": "keep-alive",
            "Host": "cq.lianjia.com",
        }
        self.f = open("lianjia.csv", mode="a", encoding="utf-8", newline="")

        # Every key produced by parseHtml must be listed here, otherwise
        # DictWriter.writerow raises ValueError for the unknown key.
        self.fieldnames = ["title", "areaName", "communityName", "hu_xing", "chao_xiang"]

        # extrasaction="ignore": insert_many adds an "_id" key to each dict in
        # place, so rows that were already stored in MongoDB carry an extra
        # key that must not break the CSV export.
        self.csv_writer = csv.DictWriter(
            self.f,
            fieldnames=self.fieldnames,
            delimiter='\t',
            extrasaction="ignore",
        )
        # The file is opened in append mode; only write the header when the
        # file is still empty so repeated runs do not duplicate it.
        if self.f.tell() == 0:
            self.csv_writer.writeheader()

        # NOTE(review): credentials and host are hard-coded in the URI;
        # replace them with your own deployment's values.
        self.conn = pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/')
        self.db = self.conn['niit']
        self.myset = self.db['housedb']

    def getHtml(self, start_page=16, end_page=20):
        """Fetch listing pages and store the houses found on each one.

        Args:
            start_page: first listing page number to fetch (inclusive).
            end_page: page number to stop at (exclusive, as in ``range``).
        """
        for num in range(start_page, end_page):
            # Random 1-3 second pause between listing pages.
            time.sleep(random.randint(1, 3))
            url = f"https://cq.lianjia.com/ershoufang/pg{num}/"
            response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            print(url)
            html = etree.HTML(response.text)
            houselist = self.parseHtml(html)
            # self.save_html(houselist)
            self.save_html2_mongo(houselist)

    def parseHtml(self, html):
        """Follow every detail link on a listing page and collect house dicts.

        Args:
            html: parsed listing page (lxml element tree).

        Returns:
            A list of dicts, one per detail page that could be fetched and
            parsed. Listings that raise are reported and skipped.
        """
        href = html.xpath('//div[@class="title"]/a/@href')
        print(href)
        houselist = []
        for link in href:
            try:
                # Second request: the detail page of a single listing.
                response2 = requests.get(url=link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                html2 = etree.HTML(response2.text)
                house = self._parse_detail(html2)
            except Exception as err:
                # Deliberate best-effort: one broken listing (network error,
                # unexpected page layout) must not abort the whole crawl.
                print(f"异常发生在第 {err.__traceback__.tb_lineno} 行")
                traceback.print_exc()
            else:
                print(house)
                houselist.append(house)

        return houselist

    def _parse_detail(self, html2):
        """Extract the fields of one house from its parsed detail page.

        Raises:
            IndexError: if the area or community name is missing; parseHtml
                treats that as an unusable page and skips the listing.
        """
        title = html2.xpath('//div[@class="sellDetailHeader"]//h1/text()')
        area_name = html2.xpath('//div[@class="areaName"]/span[2]/a[1]/text()')
        community_name = html2.xpath('//div[@class="communityName"]/a[1]/text()')
        hu_xing = html2.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[1]/text()')
        chao_xiang = html2.xpath('//div[@class="base"]/div[@class="content"]/ul/li[7]/text()')
        return {
            'title': title[0] if title else None,
            'areaName': area_name[0],
            'communityName': community_name[0],
            # The layout is read from the second text node, so the guard has
            # to check for at least two nodes, not merely a non-empty list.
            'hu_xing': str(hu_xing[1]).strip() if len(hu_xing) > 1 else None,
            # The first text node of this <li> can be pure whitespace, so take
            # the first node that has visible content instead of index 0.
            'chao_xiang': self._first_nonblank(chao_xiang),
        }

    @staticmethod
    def _first_nonblank(texts):
        """Return the first text node with visible content, stripped, or None."""
        for text in texts:
            stripped = str(text).strip()
            if stripped:
                return stripped
        return None

    def save_html2_mongo(self, dicts):
        """Insert the given house dicts into the MongoDB collection.

        An empty list is skipped because insert_many rejects an empty
        document list.
        """
        if not dicts:
            return
        self.myset.insert_many(dicts)

    def save_html(self, houselist):
        """Append the given house dicts to the CSV file, one row each."""
        if not houselist:
            return
        print(len(houselist))
        for row in houselist:
            print(row)
            self.csv_writer.writerow(row)

    def close(self):
        """Release the CSV file handle and the MongoDB connection."""
        self.f.close()
        self.conn.close()

    def run(self):
        """Run the crawl, then release the file and database resources."""
        try:
            self.getHtml()
        finally:
            self.close()

if __name__ == "__main__":
    # Script entry point: build the spider and start it in one expression.
    LianJia().run()
  • 查看

  • 案例3

import pandas as pd
import pymongo
# Connect to MongoDB (replace the URI with your own server address).
client = pymongo.MongoClient('mongodb://root:root@192.168.128.100:27017/')
try:
    # Select the database and collection (replace with your own names).
    db = client['niit']
    collection = db['housedb']

    # Read every document of the collection into a list, then build a
    # DataFrame from it.
    data = pd.DataFrame(list(collection.find()))
finally:
    # Close the connection even if the query above raises.
    client.close()

print(data.columns)
print(data['title'])
print(data.head())

# Summary statistics of the DataFrame
print(data.describe())
# info() prints its report itself and returns None, so it is called directly;
# print(data.info) would only show the repr of the bound method.
data.info()
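  • 控制台打印(注:下方最后一段 <bound method DataFrame.info of …> 是把 data.info 这个方法对象本身打印出来的结果,并不是列信息;调用 data.info() 才会输出各列的类型与非空计数)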
Index(['_id', 'title', 'areaName', 'communityName', 'hu_xing', 'chao_xiang'], dtype='object')
0           弹子石轻轨站旁精装两房 户型方正 采光好
1         石桥铺秋玉景苑   正规2室2厅出售看房方便
2           轻轨房,住家装修,小区环境优美,配套齐全
3             小区环境优美,价格亲民,配套设施完善
4         万象城商圈  精装2房  中高楼层  视野好
                 ...            
114      精装两房    急售   欢迎看房  价格可谈
115           户型方正采光好居家方便家装房居家两室
116       东原物业,楼层好,交通方便,住家安静,诚心卖
117    南坪商圈 四公里双轻轨 蓝山日记 住家两房 近永辉
118       交通方便,配套齐全。装修保持好,大两房急售。
Name: title, Length: 119, dtype: object
                        _id  ...                    chao_xiang
0  659f501bc1821d1d9101599d  ...  \n                          
1  659f501bc1821d1d9101599e  ...  \n                          
2  659f501bc1821d1d9101599f  ...  \n                          
3  659f501bc1821d1d910159a0  ...  \n                          
4  659f501bc1821d1d910159a1  ...  \n                          

[5 rows x 6 columns]
                             _id  ...                    chao_xiang
count                        119  ...                           119
unique                       119  ...                             1
top     659f501bc1821d1d9101599d  ...  \n                          
freq                           1  ...                           119

[4 rows x 6 columns]
<bound method DataFrame.info of                           _id  ...                    chao_xiang
0    659f501bc1821d1d9101599d  ...  \n                          
1    659f501bc1821d1d9101599e  ...  \n                          
2    659f501bc1821d1d9101599f  ...  \n                          
3    659f501bc1821d1d910159a0  ...  \n                          
4    659f501bc1821d1d910159a1  ...  \n                          
..                        ...  ...                           ...
114  659f5083c1821d1d91015a0f  ...  \n                          
115  659f5083c1821d1d91015a10  ...  \n                          
116  659f5083c1821d1d91015a11  ...  \n                          
117  659f5083c1821d1d91015a12  ...  \n                          
118  659f5083c1821d1d91015a13  ...  \n                          

[119 rows x 6 columns]>
posted @ 2024-01-11 15:12  DogLeftover  阅读(19)  评论(0)  编辑  收藏  举报