【2022.8.10】写了一个用于爬取exchange邮件的爬虫,使用exchangelib提供的线程接口,登录邮箱,下载附件,将hash保存到本地用于去重,使用了多线程提升爬取速度
from __future__ import print_function
import shutil
from exchangelib import Credentials, Account, Configuration, DELEGATE, FileAttachment, EWSDateTime
from multiprocessing.pool import Pool
from exchangelib.protocol import BaseProtocol
from exchangelib.protocol import NoVerifyHTTPAdapter
from urllib3.exceptions import InsecureRequestWarning
import urllib3
import time, os, sys, linecache
# Certificate verification is switched off further down (NoVerifyHTTPAdapter),
# so silence the warning urllib3 would otherwise print for each request.
urllib3.disable_warnings(InsecureRequestWarning)

# Connection settings. They are blank here and must be filled in before use.
URL = ""  # not referenced by the code shown in this file
MAIL_SERVER = ""  # server passed to exchangelib's Configuration in getAccount()
suffix = ""  # mail domain appended to bare user names in getAccount()

# Make every exchangelib connection skip TLS certificate verification and
# send a browser-like User-Agent header.
BaseProtocol.HTTP_ADAPTER_CLS = NoVerifyHTTPAdapter
BaseProtocol.USERAGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"

# FILTER, TSTART and TEND are defined but not referenced by the code shown here.
FILTER = ['admin_meical', 'e_news', 'LanguageCenter', 'rtaf_news', 'weather', "Welfare", "dict1"]
# Only attachments whose name contains this substring are downloaded.
AttachFilter = 'ppt'
TSTART = EWSDateTime(2020, 7, 3)
TEND = EWSDateTime(2022, 7, 3)

# Shared directory that receives a copy of every downloaded attachment.
pwd_path = os.getcwd()
AttachDir = os.path.join(pwd_path, 'attach')
# exist_ok avoids the isdir()/mkdir() race when several worker processes
# import this module at the same time.
os.makedirs(AttachDir, exist_ok=True)
def R(message):
    """Return *message* wrapped in the ANSI bold bright-red escape sequence."""
    template = "\033[1;91m{}\033[0;m"
    return template.format(message)
def G(message):
    """Return *message* wrapped in the ANSI bold bright-green escape sequence."""
    template = "\033[1;92m{}\033[0;m"
    return template.format(message)
def Y(message):
    """Return *message* wrapped in the ANSI bold bright-yellow escape sequence."""
    template = "\033[1;93m{}\033[0;m"
    return template.format(message)
def B(message):
    """Return *message* wrapped in the ANSI bold bright-blue escape sequence."""
    template = "\033[1;94m{}\033[0;m"
    return template.format(message)
def PrintException():
    """Print the exception currently being handled, in red, with its source line.

    The line number and source text are taken from the traceback object
    returned by ``sys.exc_info()``. Intended to be called from inside an
    ``except`` block; when no exception is being handled the function returns
    without printing anything.
    """
    _, exc_obj, tb = sys.exc_info()
    if tb is None:
        # Called outside an except block: there is nothing to report.
        return
    frame = tb.tb_frame
    lineno = tb.tb_lineno
    filename = frame.f_code.co_filename
    # Refresh the cache in case the file changed on disk since it was read.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, frame.f_globals)
    print(R('EXCEPTION IN (LINE {} "{}"): {}'.format(lineno, line.strip(), exc_obj)))
def getAccount(username, password):
    """Build an exchangelib ``Account`` for the given login.

    If *username* does not already end with the module-level ``suffix``,
    ``"@" + suffix`` is appended to form the address. The account is created
    against ``MAIL_SERVER`` with autodiscover disabled and delegate access.

    Args:
        username: login name or full mail address.
        password: password for that login.

    Returns:
        The constructed ``Account`` object.
    """
    address = username if username.endswith(suffix) else username + "@" + suffix
    creds = Credentials(address, password)
    server_config = Configuration(server=MAIL_SERVER, credentials=creds)
    return Account(
        primary_smtp_address=address,
        config=server_config,
        autodiscover=False,
        access_type=DELEGATE,
    )
def log(user, text):
    """Append *text* followed by a newline to ``log.txt`` inside directory *user*.

    Args:
        user: path of the directory that holds the log file.
        text: line to append (UTF-8 encoded on disk).
    """
    log_path = os.path.join(user, "log.txt")
    with open(log_path, "a", encoding="utf-8") as handle:
        handle.write(text + "\n")
def getinfo(user, account):
    """Print item counts for the account's inbox, trash, outbox and sent folders.

    Args:
        user: label printed in front of the inbox summary.
        account: object exposing ``inbox``, ``trash``, ``outbox`` and ``sent``
            folders with ``total_count`` (and, for the inbox, ``unread_count``).
    """
    inbox_total = account.inbox.total_count
    inbox_unread = account.inbox.unread_count
    summary = "[*]Found {} mails in inbox, {} unread".format(inbox_total, inbox_unread)
    print(user, summary)
    # Remaining folders are reported one per line as "<name> <total>".
    for folder_name in ("trash", "outbox", "sent"):
        print(folder_name, getattr(account, folder_name).total_count)
def mkuserdir(user):
    """Create directory *user*, or print a notice if it already exists."""
    if os.path.isdir(user):
        print(B("[*]Dir %s already exists" % user))
        return
    os.mkdir(user)
def _safe_file_name(raw_name, max_len=60, keep=55):
    """Return a directory-free, length-limited file name, or None if unusable."""
    if not raw_name:
        return None
    # The name is chosen by the mail sender: drop any directory part so that
    # something like "../../x.ppt" cannot be written outside the target folder.
    base = os.path.basename(raw_name.replace("\\", "/"))
    if base in ("", ".", ".."):
        return None
    if len(base) > max_len:
        # Shorten the stem but keep the extension; splitext also copes with
        # names that contain no dot at all.
        stem, ext = os.path.splitext(base)
        base = stem[:keep] + ext
    return base


def download_attachments(items, user):
    """Save matching file attachments from *items* below the *user* directory.

    For every item that has attachments, each ``FileAttachment`` whose name
    contains ``AttachFilter`` is written to
    ``<user>/<sender local part>_<received date>_<importance>/`` and a copy is
    placed in ``AttachDir``. Every saved file is recorded with ``log`` and
    announced on stdout.

    Processing is best effort: an exception raised while handling one item is
    reported through ``PrintException`` and the loop moves on to the next item.

    Args:
        items: iterable of mailbox items, e.g. the result of ``folder.all()``.
        user: path of the per-user output directory.
    """
    for item in items:
        try:
            if not item.has_attachments:
                continue
            sender = item.sender.email_address.split("@")[0]
            received = str(item.datetime_received).split()[0]
            folder_name = "%s_%s_%s" % (sender, received, item.importance)
            # The sender part is external data too; keep it a single component.
            folder_name = folder_name.replace("/", "_").replace("\\", "_")
            pathh = os.path.join(user, folder_name)
            for attachment in item.attachments:
                if not isinstance(attachment, FileAttachment):
                    continue
                if not attachment.name or AttachFilter not in attachment.name:
                    continue
                attach_name = _safe_file_name(attachment.name)
                if attach_name is None:
                    continue
                os.makedirs(pathh, exist_ok=True)
                attach_path = os.path.join(pathh, attach_name)
                # Stream the attachment in small chunks instead of loading
                # the whole content into memory.
                with open(attach_path, 'wb') as f, attachment.fp as fp:
                    buffer = fp.read(1024)
                    while buffer:
                        f.write(buffer)
                        buffer = fp.read(1024)
                shutil.copy(attach_path, AttachDir)
                log(user, '[+]Attachment saved: ' + attachment.name)
                print(G("[+]Saved attachment: %s for user: %s" % (attachment.name, user)))
        except Exception:
            # Keep going with the next item, but make the failure visible.
            PrintException()
def getinbox(account, user):
    """Download matching attachments from every item in the account's inbox.

    Args:
        account: exchangelib account whose ``inbox`` folder is read.
        user: per-user output directory passed to ``download_attachments``.
    """
    items = account.inbox.all()
    download_attachments(items, user)
def gettrash(account, user):
    """Download matching attachments from every item in the account's trash folder.

    Args:
        account: exchangelib account whose ``trash`` folder is read.
        user: per-user output directory passed to ``download_attachments``.

    Returns:
        Always 0.
    """
    items = account.trash.all()
    download_attachments(items, user)
    return 0
def getoutbox(account, user):
    """Download matching attachments from every item in the account's outbox.

    Args:
        account: exchangelib account whose ``outbox`` folder is read.
        user: per-user output directory passed to ``download_attachments``.

    Returns:
        Always 0.
    """
    items = account.outbox.all()
    download_attachments(items, user)
    return 0
def getsent(account, user):
    """Download matching attachments from every item in the account's sent folder.

    Args:
        account: exchangelib account whose ``sent`` folder is read.
        user: per-user output directory passed to ``download_attachments``.

    Returns:
        Always 0.
    """
    items = account.sent.all()
    download_attachments(items, user)
    return 0
def usermail(user, passwd):
    """Log in as *user* and download attachments from all four mail folders.

    The per-user directory is created first. Building the account is tried
    twice with a 20 second pause in between; if both attempts fail the
    function gives up. The download pass (inbox, trash, outbox, sent) is also
    tried twice, with a 10 second pause in between.

    Args:
        user: login name; also used as the output directory name.
        passwd: password for that login.

    Returns:
        True when one download pass completed, False when login or both
        download passes failed.
    """
    mkuserdir(user)

    login_attempts = 2
    account = None
    for attempt in range(login_attempts):
        try:
            account = getAccount(user, passwd)
            break
        except Exception:
            PrintException()
            if attempt + 1 < login_attempts:
                time.sleep(20)
    if account is None:
        # Never reached the mailbox; there is nothing to download.
        return False

    download_attempts = 2
    for attempt in range(download_attempts):
        try:
            getinbox(account, user)
            gettrash(account, user)
            getoutbox(account, user)
            getsent(account, user)
            return True
        except Exception:
            PrintException()
            if attempt + 1 < download_attempts:
                time.sleep(10)
    return False
def account_entry(fp):
    """Process one batch of ``"user password"`` lines.

    Each line is split on the first run of whitespace into a user name and a
    password, and ``usermail`` is called for the pair. Blank lines are skipped
    silently; lines with only one field are skipped with a warning, so a
    single bad line no longer aborts the rest of the batch.

    Args:
        fp: iterable of text lines.
    """
    for line in fp:
        # maxsplit=1 keeps any spaces that are part of the password.
        fields = line.strip().split(None, 1)
        if not fields:
            continue
        if len(fields) != 2:
            print(Y("[!]Skipping malformed account line: %r" % line))
            continue
        user, passwd = fields
        usermail(user, passwd)
def split_into_batches(lines, batch_size):
    """Split *lines* into consecutive lists of at most *batch_size* entries."""
    return [lines[start:start + batch_size]
            for start in range(0, len(lines), batch_size)]


if __name__ == '__main__':
    # new.txt holds one "user password" pair per line; blank lines (such as
    # the one produced by a trailing newline) are dropped here.
    with open("new.txt") as accounts_file:
        fd = [entry for entry in accounts_file.read().split("\n") if entry.strip()]

    thread_jobs = 80  # number of accounts handled by each worker process
    all_list = split_into_batches(fd, thread_jobs)

    # One worker process is started per batch.
    thread_num = len(all_list)
    print(G("[*]Thread num is: %s" % str(thread_num)))
    if all_list:
        # Pool() rejects processes=0, so it is only created when there is work.
        with Pool(processes=thread_num) as pool:
            pool.map(account_entry, all_list)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库