从煎蛋网上抓取妹子图
原创
#!/usr/bin/env python2.7
#coding: utf-8
"""Scrape the picture thread on jandan.net and mail the pictures to yourself.

The script downloads every picture found on the page into ``savepath``,
builds one HTML mail that shows each picture inline (referenced through
``cid:`` URLs) and sends it through the configured SMTP account.

The code runs on Python 2.7 (as the shebang requests) and on Python 3.
Fill in the account settings below before running it.
"""
import os
import re
import smtplib
import sys
from contextlib import closing
from email.header import Header
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

try:  # Python 3
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin, urlparse

# --- configuration (placeholders) -------------------------------------------
sender = 'from@126.com'
receiver = 'to@126.com'
smtpserver = 'smtp.126.com'
username = 'username'
password = 'password'
savepath = './img'  # directory the downloaded pictures are stored in

if sys.version_info[0] == 2:
    # Legacy Python 2 hack kept for backward compatibility: it lets byte
    # strings and unicode strings holding non-ASCII text be mixed implicitly.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

# <span class="current-comment-page">[page number]</span>
_PAGE_RE = re.compile(
    r'"current-comment-page">\[(?P<page>.*?)\]</span>', re.I)

# One comment entry.  The page markup this expects:
#   <li id="comment-FLOOR">
#   @</a>TIME</span>
#   <a href="LINK">#
#   <p>TEXT<img src="IMAGE" />
# NOTE(review): the lazy [\s\S]*? gaps can run past a </li> when a comment
# has no <img>, pairing one comment's id with a later picture - confirm
# against the live markup.
_COMMENT_RE = re.compile(
    r'<li id="comment-(?P<floor>.*?)">[\s\S]*?@</a>(?P<time>.*?)</span>'
    r'[\s\S]*?<a href="(?P<link>.*?)">#[\s\S]*?<p>(?P<text>[\s\S]*?)'
    r'<img src="(?P<image>.*?)"[\s\S]*?</li>',
    re.I)

# The comment id is used as a file name and as a Content-ID, so only accept
# ids that cannot escape the image directory.
_SAFE_FLOOR_RE = re.compile(r'[A-Za-z0-9_-]+\Z')


def readhtml(url, imgdir=None):
    """Fetch *url*, download every picture on it and describe the comments.

    Args:
        url: address of the comment page to scrape.
        imgdir: directory the pictures are written to; defaults to the
            module-level ``savepath``.  Each picture is stored under the
            comment id ("floor") it belongs to.

    Returns:
        A list of dicts with the keys ``floor``, ``time``, ``link`` and
        ``text``, one per comment whose picture was downloaded.  Comments
        with an unsafe id, a non-HTTP(S) picture URL or a failed download
        are reported on stdout and left out.
    """
    if imgdir is None:
        imgdir = savepath

    with closing(urlopen(url)) as response:
        html = response.read().decode('utf-8')

    # The page number is informational only, so a missing marker must not
    # abort the run.
    match_page = _PAGE_RE.search(html)
    print('page=' + (match_page.group('page') if match_page else 'unknown'))

    result = []
    for match in _COMMENT_RE.finditer(html):
        floor = match.group('floor')
        if not _SAFE_FLOOR_RE.match(floor):
            print('skip comment with unsafe id: ' + repr(floor))
            continue

        # Resolve relative / protocol-relative sources against the page and
        # refuse anything that is not a web URL: the markup is untrusted and
        # urlretrieve would happily copy local files.
        image = urljoin(url, match.group('image').strip())
        if urlparse(image).scheme not in ('http', 'https'):
            print('skip ' + floor + ': unsupported image URL ' + image)
            continue

        try:
            urlretrieve(image, os.path.join(imgdir, floor))
        except IOError as err:
            print('skip ' + floor + ': download failed: ' + str(err))
            continue

        result.append({
            'floor': floor,
            'time': match.group('time'),
            'link': match.group('link'),
            'text': '<p>' + match.group('text').strip(),
        })
        print(floor)
    return result


def buildmail(infos, imgdir=None):
    """Build the multipart/related mail that shows every picture inline.

    Args:
        infos: list of dicts as returned by ``readhtml``.
        imgdir: directory holding the downloaded pictures (one file per
            ``floor``); defaults to the module-level ``savepath``.

    Returns:
        A ``MIMEMultipart('related')`` message whose first part is the HTML
        body, followed by one image part per entry, each carrying the
        Content-ID the HTML refers to.  Entries whose file is not a
        recognisable image are reported on stdout and left out.
    """
    if imgdir is None:
        imgdir = savepath

    msgRoot = MIMEMultipart('related')
    msgRoot['Subject'] = Header(u'煎蛋-妹子图', 'utf-8')

    fragments = []
    attachments = []
    for info in infos:
        floor = info['floor']
        with open(os.path.join(imgdir, floor), 'rb') as fp:
            data = fp.read()
        try:
            # MIMEImage raises TypeError when it cannot guess the subtype.
            msgImage = MIMEImage(data)
        except TypeError:
            print('skip ' + floor + ': not a recognised image')
            continue
        msgImage.add_header('Content-ID', '<{0}>'.format(floor))
        attachments.append(msgImage)
        fragments.append(
            '<a href="{0}">{1}</a>{2}{3}<br><img src="cid:{4}"><hr>'.format(
                info['link'], floor, info['time'], info['text'], floor))

    # The root of a multipart/related message is its first part, so the HTML
    # body has to be attached before the images it references.
    msgRoot.attach(
        MIMEText(''.join(fragments), _subtype='html', _charset='utf-8'))
    for msgImage in attachments:
        msgRoot.attach(msgImage)
    return msgRoot


def main():
    """Scrape the page, build the mail and send it."""
    if not os.path.isdir(savepath):
        os.makedirs(savepath)
    result = readhtml('http://jandan.net/ooxx')
    mailbody = buildmail(result)

    smtp = smtplib.SMTP()
    smtp.connect(smtpserver)
    try:
        smtp.login(username, password)
        smtp.sendmail(sender, receiver, mailbody.as_string())
    finally:
        # Always end the session, even when login or sending fails.
        smtp.quit()
    print('OK')


if __name__ == '__main__':
    main()
#!/usr/bin/env python2.7
#coding: utf-8
"""Scrape the picture thread on jandan.net and mail the pictures to yourself.

The script downloads every picture found on the page into ``savepath``,
builds one HTML mail that shows each picture inline (referenced through
``cid:`` URLs) and sends it through the configured SMTP account.

The code runs on Python 2.7 (as the shebang requests) and on Python 3.
Fill in the account settings below before running it.
"""
import os
import re
import smtplib
import sys
from contextlib import closing
from email.header import Header
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

try:  # Python 3
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin, urlparse

# --- configuration (placeholders) -------------------------------------------
sender = 'from@126.com'
receiver = 'to@126.com'
smtpserver = 'smtp.126.com'
username = 'username'
password = 'password'
savepath = './img'  # directory the downloaded pictures are stored in

if sys.version_info[0] == 2:
    # Legacy Python 2 hack kept for backward compatibility: it lets byte
    # strings and unicode strings holding non-ASCII text be mixed implicitly.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

# <span class="current-comment-page">[page number]</span>
_PAGE_RE = re.compile(
    r'"current-comment-page">\[(?P<page>.*?)\]</span>', re.I)

# One comment entry.  The page markup this expects:
#   <li id="comment-FLOOR">
#   @</a>TIME</span>
#   <a href="LINK">#
#   <p>TEXT<img src="IMAGE" />
# NOTE(review): the lazy [\s\S]*? gaps can run past a </li> when a comment
# has no <img>, pairing one comment's id with a later picture - confirm
# against the live markup.
_COMMENT_RE = re.compile(
    r'<li id="comment-(?P<floor>.*?)">[\s\S]*?@</a>(?P<time>.*?)</span>'
    r'[\s\S]*?<a href="(?P<link>.*?)">#[\s\S]*?<p>(?P<text>[\s\S]*?)'
    r'<img src="(?P<image>.*?)"[\s\S]*?</li>',
    re.I)

# The comment id is used as a file name and as a Content-ID, so only accept
# ids that cannot escape the image directory.
_SAFE_FLOOR_RE = re.compile(r'[A-Za-z0-9_-]+\Z')


def readhtml(url, imgdir=None):
    """Fetch *url*, download every picture on it and describe the comments.

    Args:
        url: address of the comment page to scrape.
        imgdir: directory the pictures are written to; defaults to the
            module-level ``savepath``.  Each picture is stored under the
            comment id ("floor") it belongs to.

    Returns:
        A list of dicts with the keys ``floor``, ``time``, ``link`` and
        ``text``, one per comment whose picture was downloaded.  Comments
        with an unsafe id, a non-HTTP(S) picture URL or a failed download
        are reported on stdout and left out.
    """
    if imgdir is None:
        imgdir = savepath

    with closing(urlopen(url)) as response:
        html = response.read().decode('utf-8')

    # The page number is informational only, so a missing marker must not
    # abort the run.
    match_page = _PAGE_RE.search(html)
    print('page=' + (match_page.group('page') if match_page else 'unknown'))

    result = []
    for match in _COMMENT_RE.finditer(html):
        floor = match.group('floor')
        if not _SAFE_FLOOR_RE.match(floor):
            print('skip comment with unsafe id: ' + repr(floor))
            continue

        # Resolve relative / protocol-relative sources against the page and
        # refuse anything that is not a web URL: the markup is untrusted and
        # urlretrieve would happily copy local files.
        image = urljoin(url, match.group('image').strip())
        if urlparse(image).scheme not in ('http', 'https'):
            print('skip ' + floor + ': unsupported image URL ' + image)
            continue

        try:
            urlretrieve(image, os.path.join(imgdir, floor))
        except IOError as err:
            print('skip ' + floor + ': download failed: ' + str(err))
            continue

        result.append({
            'floor': floor,
            'time': match.group('time'),
            'link': match.group('link'),
            'text': '<p>' + match.group('text').strip(),
        })
        print(floor)
    return result


def buildmail(infos, imgdir=None):
    """Build the multipart/related mail that shows every picture inline.

    Args:
        infos: list of dicts as returned by ``readhtml``.
        imgdir: directory holding the downloaded pictures (one file per
            ``floor``); defaults to the module-level ``savepath``.

    Returns:
        A ``MIMEMultipart('related')`` message whose first part is the HTML
        body, followed by one image part per entry, each carrying the
        Content-ID the HTML refers to.  Entries whose file is not a
        recognisable image are reported on stdout and left out.
    """
    if imgdir is None:
        imgdir = savepath

    msgRoot = MIMEMultipart('related')
    msgRoot['Subject'] = Header(u'煎蛋-妹子图', 'utf-8')

    fragments = []
    attachments = []
    for info in infos:
        floor = info['floor']
        with open(os.path.join(imgdir, floor), 'rb') as fp:
            data = fp.read()
        try:
            # MIMEImage raises TypeError when it cannot guess the subtype.
            msgImage = MIMEImage(data)
        except TypeError:
            print('skip ' + floor + ': not a recognised image')
            continue
        msgImage.add_header('Content-ID', '<{0}>'.format(floor))
        attachments.append(msgImage)
        fragments.append(
            '<a href="{0}">{1}</a>{2}{3}<br><img src="cid:{4}"><hr>'.format(
                info['link'], floor, info['time'], info['text'], floor))

    # The root of a multipart/related message is its first part, so the HTML
    # body has to be attached before the images it references.
    msgRoot.attach(
        MIMEText(''.join(fragments), _subtype='html', _charset='utf-8'))
    for msgImage in attachments:
        msgRoot.attach(msgImage)
    return msgRoot


def main():
    """Scrape the page, build the mail and send it."""
    if not os.path.isdir(savepath):
        os.makedirs(savepath)
    result = readhtml('http://jandan.net/ooxx')
    mailbody = buildmail(result)

    smtp = smtplib.SMTP()
    smtp.connect(smtpserver)
    try:
        smtp.login(username, password)
        smtp.sendmail(sender, receiver, mailbody.as_string())
    finally:
        # Always end the session, even when login or sending fails.
        smtp.quit()
    print('OK')


if __name__ == '__main__':
    main()