Web截图实现关键词高亮
代码
import os
import re
import sys
import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
# When true, log() prints progress messages to stdout.
DEBUG_MODE = True

# The jQuery source is read once at import time so it can be injected into
# pages that do not ship jQuery themselves.  The encoding is pinned to UTF-8
# so decoding does not depend on the platform's locale default.
with open("jquery.min.js", encoding="utf-8") as jquery_file:
    JQUERY_SCRIPT = jquery_file.read()
def log(msg):
    """Print *msg* to stdout, but only while DEBUG_MODE is enabled."""
    if not DEBUG_MODE:
        return
    print(msg)
# Scrolls the page top to bottom in 100px steps (one step every 100ms, after
# an initial 1s delay), then scrolls back to the top and appends
# "scroll-done" to document.title so the Python side can poll for completion.
_SCROLL_SCRIPT = """
var y = 0;
var step = 100;
window.scroll(0, 0);
function f() {
    if (y < document.body.scrollHeight) {
        y += step;
        window.scroll(0, y);
        setTimeout(f, 100);
    } else {
        window.scroll(0, 0);
        document.title += "scroll-done";
    }
}
setTimeout(f, 1000);
"""

# Returns true when the regex source in arguments[0] matches the body text.
_HIT_TEST_SCRIPT = """
return new RegExp(arguments[0]).test(jQuery('body').text());
"""

# Wraps every keyword occurrence in a highlighted <span>.
# Two RegExp objects are used on purpose: a global regex keeps state in
# lastIndex between test() calls, so the filter uses a non-global one, while
# the replacement needs the "g" flag to highlight every occurrence
# (assigning to RegExp.global has no effect because it is read-only).
# The replacement runs over innerHTML, so a keyword that also appears inside
# markup of a matched element (e.g. an attribute value) is rewritten as well.
_HIGHLIGHT_SCRIPT = """
var pattern = arguments[0];
var testRe = new RegExp(pattern);
var replaceRe = new RegExp(pattern, "g");
var wrapper = "<span style='color:yellow;font-size:1.5em;" +
    "background-color:red;font-weight:bold'>$&</span>";
jQuery('body').find('*').filter(function () {
    // Only look at the element's own text, not the text of its children.
    var own = jQuery(this).clone();
    own.children().remove();
    return testRe.test(own.text());
}).each(function () {
    var node = jQuery(this);
    node.html(node.html().replace(replaceRe, wrapper));
});
"""

# Characters that carry special meaning in a JavaScript regular expression.
_JS_REGEX_SPECIAL = re.compile(r"[\\^$.*+?()\[\]{}|]")


def _keyword_pattern(keywords):
    """Return a JS regex source matching any keyword literally ('' if none)."""
    if not keywords:
        return ""
    escaped = [_JS_REGEX_SPECIAL.sub(r"\\\g<0>", kw) for kw in keywords if kw]
    return "|".join(escaped)


def _init_browser(save_fn):
    """Start PhantomJS with the window size and timeouts used for captures."""
    log_path = "log/" + save_fn + ".log"
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # NOTE(review): webdriver.PhantomJS is only available in older Selenium
    # releases -- confirm the pinned Selenium version before upgrading.
    browser = webdriver.PhantomJS(service_log_path=log_path)
    browser.set_window_size(1400, 900)
    browser.set_page_load_timeout(40)
    browser.set_script_timeout(40)
    return browser


def _scroll_through_page(browser):
    """Scroll the whole page once, waiting up to ~30s for it to finish."""
    browser.execute_script(_SCROLL_SCRIPT)
    for _ in range(30):
        if "scroll-done" in browser.title:
            break
        time.sleep(1)


def _ensure_jquery(browser):
    """Inject the bundled jQuery when the page does not define it."""
    has_jq = browser.execute_script(
        """return typeof(jQuery)!="undefined" """)
    if not has_jq:
        log("加载jquery脚本")
        browser.execute_script(JQUERY_SCRIPT)


def _save_screenshot(browser, save_fn):
    """Flatten the screenshot onto white and write it under img/ as JPEG."""
    png_data = browser.get_screenshot_as_png()
    # Force RGBA so an alpha band always exists, even for opaque PNGs.
    img = Image.open(BytesIO(png_data)).convert("RGBA")
    background = Image.new("RGB", img.size, (255, 255, 255))
    background.paste(img, mask=img.split()[3])
    out_path = "img/" + save_fn
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # The data is always JPEG-encoded, whatever extension save_fn carries.
    background.save(out_path, 'JPEG', quality=80)


def capture_with_highlight(url, save_fn="capture.png", keywords=None):
    """Capture a full-page screenshot of *url* with keywords highlighted.

    The page is opened in PhantomJS and scrolled from top to bottom once
    before the screenshot is taken.  When *keywords* is given, every
    occurrence of any keyword is wrapped in a highlighted ``<span>``.
    Keywords are matched literally: regex metacharacters are escaped.

    Args:
        url: Address of the page to capture.
        save_fn: File name of the image, written to ``img/<save_fn>``; the
            PhantomJS service log goes to ``log/<save_fn>.log``.
        keywords: Optional iterable of strings to highlight.

    Returns:
        False when keywords were given but none occurs in the page text
        (no image is written); True once the screenshot has been saved.
    """
    browser = _init_browser(save_fn)
    try:
        log("正在打开页面:"+url)
        browser.get(url)
        log("执行滚动脚本")
        _scroll_through_page(browser)
        log("滚动完成,判定关键词命中并添加高亮")
        pattern = _keyword_pattern(keywords)
        if pattern:
            _ensure_jquery(browser)
            # The pattern travels as a script argument rather than being
            # spliced into the JS source, so quotes and slashes are safe.
            if not browser.execute_script(_HIT_TEST_SCRIPT, pattern):
                log("未在页面上找到关键词")
                return False
            browser.execute_script(_HIGHLIGHT_SCRIPT, pattern)
        _save_screenshot(browser, save_fn)
        return True
    finally:
        # Always shut the driver down so no PhantomJS process is left behind.
        browser.quit()
# Demo run: capture one product page and highlight a single keyword.
if __name__ == "__main__":
    demo_url = "https://item.jd.com/10124713723.html"
    demo_keywords = ["老字号"]
    capture_with_highlight(demo_url, keywords=demo_keywords)
结果(部分截图)
流程图