import asyncio
import datetime
import json
import pymongo
import requests
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from redis import Redis
from telethon import TelegramClient
# https://my.telegram.org/apps
from telethon.utils import get_display_name
# Basic database connection settings, read by the Mongo wrapper class.
db_configs = {
    "type": "mongo",
    "host": "127.0.0.1",
    # Kept as a string: it is interpolated into the connection URI.
    "port": "27017",
    # Empty credentials mean no authentication is attempted.
    "user": "",
    "password": "",
    "db_name": "new_spider",
}
class Mongo:
    """Thin wrapper around a PyMongo client bound to the configured database.

    Connection settings are read from the module-level ``db_configs`` mapping.

    Attributes:
        db_name, host, port, username, password: values copied from
            ``db_configs``.
        client: the ``pymongo.MongoClient`` instance.
        db: the database handle ``client[db_name]``.
    """

    def __init__(self):
        """Create the client and select the database named in ``db_configs``.

        Credentials are only sent when both a user name and a password are
        configured; otherwise the client connects without authentication.
        """
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # db_configs spells the key "password". The legacy "passwd" spelling
        # is still honoured as a fallback so older configs keep working.
        self.password = db_configs.get("password") or db_configs.get("passwd")

        client_options = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() no longer exists in PyMongo 4; the
            # credentials are handed to MongoClient instead. authSource keeps
            # authentication scoped to the working database, as
            # authenticate() on that database did.
            client_options.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )
        self.client = pymongo.MongoClient(
            f"mongodb://{self.host}:{self.port}", **client_options
        )
        self.db = self.client[self.db_name]

    def update(self, item, col="tg_spider"):
        """Upsert one document, or each document of a list, by ``message_id``.

        Args:
            item: a single document (dict) or a list of documents. Every
                document must contain a ``"message_id"`` key.
            col: name of the target collection.

        Raises:
            KeyError: if a document has no ``"message_id"`` key.
        """
        documents = item if isinstance(item, list) else [item]
        collection = self.db[col]  # looked up once, not once per document
        for document in documents:
            collection.update_one(
                {"message_id": document["message_id"]},
                {"$set": document},
                upsert=True,
            )
# Shared Redis connection. It stores the last processed message id per chat
# under keys of the form "tg_cache_id_<chat_id>".
redis_cli = Redis.from_url("redis://@localhost:6379")
# Start date handed to iter_messages as offset_date when a chat has no cached
# message id yet.
last_offset_date = datetime.datetime.strptime("2022-10-01", "%Y-%m-%d") # start date of the initial backfill
# NOTE(review): xxxx / xxxxxx / xxxxx below are not names defined anywhere in
# the visible code -- presumably redacted secrets. They must be replaced with
# real values (Telegram API credentials come from https://my.telegram.org/apps)
# before this module can be imported.
api_id = xxxx
api_hash = xxxxxx
# Module-level mapping; nothing in the visible code populates it.
channel_dict = {}
# Telegram client; 'ztelegrm' is the session name. Up to 15 connection
# retries with retry_delay=3 between attempts.
client = TelegramClient('ztelegrm', api_id, api_hash, connection_retries=15, retry_delay=3)
# Intended `url` argument for robot_warning; no active call uses it in the
# visible code.
robot_url = xxxxx
# Shared Mongo wrapper used to upsert the fetched messages.
mongo_cli = Mongo()
def robot_warning(text, url, timeout=10):
    """Send a plain-text message to an Enterprise WeChat (WeCom) robot.

    Args:
        text: message content. A dict or list is serialised to a JSON string
            first (non-ASCII characters kept as-is); anything else is sent
            unchanged.
        url: address the JSON payload is POSTed to.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            endpoint.

    Returns:
        The ``requests.Response`` of the POST.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout.
    """
    if isinstance(text, (dict, list)):
        text = json.dumps(text, ensure_ascii=False)

    payload = {
        "msgtype": "text",
        "text": {
            "content": text,
        },
    }
    return requests.post(
        url,
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload),
        timeout=timeout,
    )
async def format_message(message, chat_id):
    """Convert a Telegram message into the document stored in MongoDB.

    Side effect: records ``message.id`` in Redis under
    ``tg_cache_id_<chat_id>`` so the next run can resume after it. The cursor
    is advanced for skipped messages as well; otherwise a whole page of
    media-only messages would be fetched again on every run and the chat
    would never progress.

    Args:
        message: the Telethon message to convert.
        chat_id: id of the chat the message was fetched from.

    Returns:
        A dict with ``timestamp`` (Unix seconds, as a string), ``sender``
        (``username`` / ``firstName`` / ``lastName``), ``channel``,
        ``channel_id``, ``text`` and ``message_id``; or ``None`` when the
        message has no text or carries media.
    """
    cursor_key = f"tg_cache_id_{chat_id}"

    if not message.text or message.media:
        # Skipped, but still counted as seen.
        redis_cli.set(cursor_key, message.id)
        return None

    chat = await message.get_chat()
    sender_user = await message.get_sender()
    redis_cli.set(cursor_key, message.id)

    return {
        # strftime("%s") is a non-portable platform extension that ignores
        # tzinfo; timestamp() is portable and honours the datetime's zone.
        "timestamp": str(int(message.date.timestamp())),
        "sender": {
            "username": getattr(sender_user, 'username', ''),
            "firstName": getattr(sender_user, 'first_name', ''),
            "lastName": getattr(sender_user, 'last_name', ''),
        },
        "channel": get_display_name(chat),
        "channel_id": chat_id,
        "text": message.text,
        "message_id": message.id,
    }
async def get_channel_dict():
    """Asynchronously yield ``(chat_id, chat_name)`` for each client dialog.

    Despite the name, this is an async generator and does not build a dict.
    """
    async for dialog in client.iter_dialogs():
        yield dialog.entity.id, dialog.name
async def load_history_to_save():
    """Fetch the next batch of messages for every dialog and upsert them.

    For each dialog, at most 50 messages are read oldest-to-newest (so message
    ids increase), converted with ``format_message`` and written through
    ``mongo_cli.update``. A dialog with a cached message id in Redis resumes
    after that id; otherwise reading starts from ``last_offset_date``.

    NOTE(review): the source lost its indentation; the final print is assumed
    to run once per dialog, right after the Mongo write -- confirm.
    """
    async for channel_id, channel_name in get_channel_dict():
        cached_id = redis_cli.get(f"tg_cache_id_{channel_id}")
        resume_from = 0 if cached_id is None else cached_id
        print(channel_id, channel_name)

        entity = await client.get_entity(channel_id)
        await asyncio.sleep(10)

        if resume_from == 0:
            # No cursor yet: start at the configured date.
            window = {"offset_date": last_offset_date}
        else:
            # Resume after the last message id recorded for this dialog.
            window = {"min_id": int(resume_from)}

        batch = []
        # limit=50: at most 50 messages per dialog per run.
        async for message in client.iter_messages(entity, reverse=True, limit=50, **window):
            doc_data = await format_message(message, channel_id)
            if doc_data:
                batch.append(doc_data)

        # The original left a robot_warning(message_list, robot_url) call
        # commented out at this point.
        mongo_cli.update(batch)
        print("发送完成")
def start():
    """Run the scraper: schedule the history job and block on the event loop.

    Inside the client's context, ``load_history_to_save`` is scheduled to run
    immediately and then every 5 minutes, never with more than one instance
    at a time. The call does not return while the loop is running.
    """
    with client:
        job_options = {
            "minutes": 5,
            "max_instances": 1,
            # Fire the first run right away instead of after one interval.
            "next_run_time": datetime.datetime.now(),
        }
        scheduler = AsyncIOScheduler()
        scheduler.add_job(load_history_to_save, 'interval', **job_options)
        scheduler.start()
        client.loop.run_forever()
# Script entry point: only start the scraper when executed directly.
if __name__ == "__main__":
    start()