C#使用phantomjs,爬取AJAX加载完成之后的页面
1、开发思路:入参根据apiSetting配置文件,分配静态文件存储地址,可实现不同站点的静态页生成功能。静态页生成功能使用无头浏览器生成,生成之后的字符串进行正则替换为固定地址,实现本地正常访问。
2、已发现问题:如果 JS 在页面载入时执行某些重写 DOM 的操作,已用正则替换好的路径会被重新覆盖,导致本地访问无效。这一点只能由站点开发方对页面进行优化来避免。不过这仅影响本地访问;如果静态页面部署到服务器并使用相对路径,则不受影响。
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;

namespace QuartZNetService.Controllers
{
    /// <summary>
    /// Renders a page with the bundled PhantomJS executable, rewrites selected URLs in the
    /// resulting HTML, and stores it as a static file under a directory taken from apiSetting.json.
    /// </summary>
    public class BuildStaticController : Controller
    {
        /// <summary>
        /// Full path of the configuration file (apiSetting.json in the application base directory).
        /// </summary>
        public static string jsonUrl = AppDomain.CurrentDomain.BaseDirectory + "apiSetting.json";

        /// <summary>Origin that is prepended to root-relative script sources.</summary>
        private const string SiteOrigin = "https://www.xxx.cn";

        /// <summary>Script tags containing this marker are left untouched by the origin prefixing.</summary>
        private const string SiteDomainMarker = "xxx.cn";

        /// <summary>Matches an opening script tag and captures the value of its src attribute.</summary>
        private static readonly Regex ScriptSrcRegex =
            new Regex("<script[^>]*?src=\"([^>]*?)\"[^>]*?>", RegexOptions.IgnoreCase);

        /// <summary>
        /// Request settings for the page to be rendered.
        /// </summary>
        public class HttpConfig
        {
            /// <summary>
            /// Cookie information for the site.
            /// </summary>
            public string Cookie { get; set; }

            /// <summary>
            /// Referer of the page.
            /// </summary>
            public string Referer { get; set; }

            /// <summary>
            /// Content type; defaults to text/html with a UTF-8 charset.
            /// </summary>
            public string ContentType { get; set; }

            public string Accept { get; set; }

            public string AcceptEncoding { get; set; }

            /// <summary>
            /// Timeout in milliseconds; defaults to 100000.
            /// </summary>
            public int Timeout { get; set; }

            public string UserAgent { get; set; }

            /// <summary>
            /// Whether POST data is gzip-compressed.
            /// </summary>
            public bool GZipCompress { get; set; }

            public bool KeepAlive { get; set; }

            public string CharacterSet { get; set; }

            /// <summary>
            /// Initializes the settings with browser-like defaults.
            /// </summary>
            public HttpConfig()
            {
                this.Timeout = 100000;
                this.ContentType = "text/html; charset=" + Encoding.UTF8.WebName;
                this.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36";
                this.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                this.AcceptEncoding = "gzip,deflate";
                this.GZipCompress = false;
                this.KeepAlive = true;
                this.CharacterSet = "UTF-8";
            }
        }

        /// <summary>
        /// Uses PhantomJS to capture a page after its AJAX content has loaded and saves the
        /// rewritten HTML as a static file.
        /// </summary>
        /// <param name="url">Address of the page to render.</param>
        /// <param name="sitId">Site id; selects the output root ("sitUrl") from apiSetting.json.</param>
        /// <param name="typeId">Type id; selects the output folder from apiSetting.json. The folder may be empty.</param>
        /// <param name="fileName">Name of the file to write below the configured root and folder.</param>
        /// <param name="config">Request settings passed to the script; defaults are used when null.</param>
        /// <param name="interval">Delay in milliseconds handed to codes.js (default 3000).</param>
        /// <returns>
        /// JSON <c>true</c> on success; otherwise a JSON string describing the failure.
        /// </returns>
        public JsonResult Do(string url, string sitId, string typeId, string fileName, HttpConfig config, int interval = 3000)
        {
            try
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return Json("url is required", JsonRequestBehavior.AllowGet);
                }

                if (string.IsNullOrWhiteSpace(fileName))
                {
                    return Json("fileName is required", JsonRequestBehavior.AllowGet);
                }

                // fileName comes from the request: refuse to climb out of the configured folder.
                if (HasParentSegment(fileName))
                {
                    return Json("fileName must not contain '..' segments", JsonRequestBehavior.AllowGet);
                }

                if (config == null)
                {
                    config = new HttpConfig();
                }

                JObject settings = JObject.Parse(Readjson(sitId, typeId));
                string sitUrl = settings["url"].ToString();
                string folder = settings["folder"].ToString();

                // Without a configured root the output path would be relative to the process
                // working directory, so treat an unknown site id as an error.
                if (string.IsNullOrEmpty(sitUrl))
                {
                    return Json("no sitUrl configured for sitId '" + sitId + "'", JsonRequestBehavior.AllowGet);
                }

                string toolsDirectory = AppDomain.CurrentDomain.BaseDirectory + @"webTools\";

                // Launch the PhantomJS executable shipped in the webTools folder.
                ProcessStartInfo start = new ProcessStartInfo(toolsDirectory + "phantomjs.exe");
                start.WorkingDirectory = toolsDirectory;

                // Every argument is quoted: the user agent contains spaces and the URL is
                // caller-supplied, so unquoted values would be split or could add arguments.
                start.Arguments = string.Join(" ", new[]
                {
                    QuoteArgument(toolsDirectory + "codes.js"),
                    QuoteArgument(url),
                    QuoteArgument(interval.ToString(System.Globalization.CultureInfo.InvariantCulture)),
                    QuoteArgument(config.UserAgent),
                    QuoteArgument(config.Accept),
                    QuoteArgument(config.Referer)
                });
                start.CreateNoWindow = true;            // no console window
                start.UseShellExecute = false;          // required for stream redirection
                start.RedirectStandardOutput = true;
                start.RedirectStandardInput = true;
                start.StandardOutputEncoding = Encoding.UTF8;

                string html;
                using (Process process = Process.Start(start))
                {
                    // Drain stdout before waiting so a full pipe cannot block the child.
                    html = process.StandardOutput.ReadToEnd();
                    process.WaitForExit();
                }

                // Do not overwrite an existing static page with nothing.
                if (string.IsNullOrWhiteSpace(html))
                {
                    return Json("phantomjs produced no output", JsonRequestBehavior.AllowGet);
                }

                using (StreamWriter writer = new StreamWriter(sitUrl + folder + "//" + fileName, false, Encoding.UTF8))
                {
                    writer.Write(RewriteHtml(html));
                }

                return Json(true, JsonRequestBehavior.AllowGet);
            }
            catch (Exception ex)
            {
                return Json(ex.Message, JsonRequestBehavior.AllowGet);
            }
        }

        /// <summary>
        /// Reads the configuration file and resolves the output root and folder.
        /// </summary>
        /// <param name="sitId">Id matched against the "sitId" of each entry in the "sit" array.</param>
        /// <param name="typeId">Id matched against the "typeId" of each entry in the "type" array.</param>
        /// <returns>
        /// A JSON object string with the properties "url" and "folder"; a property is an empty
        /// string when no entry matches. When several entries match, the last one wins.
        /// </returns>
        public static string Readjson(string sitId, string typeId)
        {
            string url;
            string folder;

            using (StreamReader file = System.IO.File.OpenText(jsonUrl))
            using (JsonTextReader reader = new JsonTextReader(file))
            {
                JObject root = (JObject)JToken.ReadFrom(reader);
                url = FindValue(root["sit"], "sitId", sitId, "sitUrl");
                folder = FindValue(root["type"], "typeId", typeId, "folder");
            }

            return JsonConvert.SerializeObject(new { url = url, folder = folder });
        }

        /// <summary>
        /// Returns the <paramref name="valueKey"/> value of the last object in
        /// <paramref name="entries"/> whose <paramref name="idKey"/> equals <paramref name="id"/>,
        /// or an empty string when there is none.
        /// </summary>
        private static string FindValue(JToken entries, string idKey, string id, string valueKey)
        {
            string found = "";
            if (entries == null)
            {
                return found;
            }

            foreach (JObject item in entries.OfType<JObject>())
            {
                JToken idToken = item[idKey];
                if (idToken != null && idToken.ToString() == id)
                {
                    JToken value = item[valueKey];
                    found = value == null ? "" : value.ToString();
                }
            }

            return found;
        }

        /// <summary>
        /// Applies the URL rewrites: site origin for root-relative script sources, then
        /// "https:" for protocol-relative src/href values and "//images" references.
        /// </summary>
        private static string RewriteHtml(string html)
        {
            string rewritten = ScriptSrcRegex.Replace(html, new MatchEvaluator(PrefixRootRelativeScript));

            return rewritten
                .Replace("src=\"//", "src=\"https://")
                .Replace("href=\"//", "href=\"https://")
                .Replace("\"//images", "\"https://images");
        }

        /// <summary>
        /// Inserts the site origin in front of a script src that starts with a single "/".
        /// Absolute and protocol-relative sources, and tags that already mention the site
        /// domain, are returned unchanged.
        /// </summary>
        private static string PrefixRootRelativeScript(Match tag)
        {
            Group src = tag.Groups[1];
            bool rootRelative = src.Value.StartsWith("/", StringComparison.Ordinal)
                && !src.Value.StartsWith("//", StringComparison.Ordinal);

            if (!rootRelative || tag.Value.IndexOf(SiteDomainMarker, StringComparison.Ordinal) >= 0)
            {
                return tag.Value;
            }

            // The captured group position is independent of the attribute name's casing.
            return tag.Value.Insert(src.Index - tag.Index, SiteOrigin);
        }

        /// <summary>
        /// Tells whether a relative path contains a ".." segment.
        /// </summary>
        private static bool HasParentSegment(string relativePath)
        {
            string[] segments = relativePath.Split('/', '\\');
            return Array.IndexOf(segments, "..") >= 0;
        }

        /// <summary>
        /// Wraps a value in double quotes for a Windows command line, escaping embedded quotes
        /// and the backslashes that precede a quote. Null becomes an empty quoted argument so
        /// the positions of the following arguments stay fixed.
        /// </summary>
        private static string QuoteArgument(string value)
        {
            StringBuilder quoted = new StringBuilder("\"");
            int pendingBackslashes = 0;

            foreach (char c in value ?? "")
            {
                if (c == '\\')
                {
                    pendingBackslashes++;
                    continue;
                }

                if (c == '"')
                {
                    // Backslashes before a quote are doubled, plus one to escape the quote.
                    quoted.Append('\\', pendingBackslashes * 2 + 1);
                }
                else
                {
                    quoted.Append('\\', pendingBackslashes);
                }

                quoted.Append(c);
                pendingBackslashes = 0;
            }

            // Trailing backslashes would otherwise escape the closing quote.
            quoted.Append('\\', pendingBackslashes * 2);
            quoted.Append('"');
            return quoted.ToString();
        }
    }
}
codes.js 配置
// PhantomJS script: opens a URL, waits `interval` milliseconds after the page
// has loaded, then prints the page content to stdout.
// Arguments: <url> <interval-ms> <user-agent> <accept> <referer>
var page = require('webpage').create(),
    system = require('system');

// All console output (including the page content) is emitted as UTF-8.
phantom.outputEncoding = "UTF-8";

var url = system.args[1];

// Arguments arrive as strings; fall back to 3000 ms when the value is missing or invalid
// instead of letting the timer fire immediately.
var interval = parseInt(system.args[2], 10);
if (isNaN(interval) || interval < 0) {
    interval = 3000;
}

var settings = {
    timeout: interval,
    encoding: "UTF-8",
    operation: "GET",
    headers: {
        "User-Agent": system.args[3],
        "Accept": system.args[4],
        "Accept-Language": "zh-CN,en;q=0.7,en-US;q=0.3",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": 1,
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Referer": system.args[5]
    }
};

// NOTE(review): PhantomJS documents page.settings keys such as userAgent and
// resourceTimeout, and page.customHeaders for request headers; whether the
// "headers"/"timeout" entries assigned here take effect should be confirmed.
page.settings = settings;

if (!url) {
    console.log('Unable to post!');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to post!');
            // Non-zero exit code so the caller can tell a failed load from a rendered page.
            phantom.exit(1);
        } else {
            // Give pending AJAX requests time to finish before capturing the DOM.
            setTimeout(function () {
                console.log(page.content);
                phantom.exit();
            }, interval);
        }
    });
}
apiSetting.json 配置
{ "sit": [ { "sitId": "1", "sitUrl": "D://" }, { "sitId": "60", "sitUrl": "D://" } ], "type": [ { "typeId": "1", "folder": "zmPC" }, { "typeId": "60", "folder": "zmCP" } ] }