【C#爬虫】抓取XX网站mp4资源地址
抓取小视频的 URL 地址，然后将这些地址拷贝到迅雷里批量下载即可。
主程序代码
// Category slug -> classid pairs accepted by the site (set `type` and `classid` together):
//yazhouqingseAV 35 //zhifusiwaAV 29 //zipaishipin 30 //oumeiqingseAV 28 //katongdongman 31 //tongxingAV 32 //sanjidianying 33 //fengkuangqunjiao 34
var client = new WinHttpHelper();
var type = "fengkuangqunjiao";
var classid = 34;

// Pattern that extracts "<id>" from links of the form "<type>/<id>.html".
string linkPattern = "" + type + "/(.*?).html";

// Ids already processed. Used both to avoid writing duplicate lines and to
// detect the end of the listing (a page that contributes nothing new).
var seenIds = new System.Collections.Generic.HashSet<string>();

// The original loop condition (i > -1) never became false in practice, so the
// crawl never stopped. We now stop at the first listing page without a new id.
for (int i = 1; ; i++)
{
    Console.WriteLine(i);

    // Page 1 is "index.html"; later pages are "index_<n>.html".
    string index = i == 1 ? "" : "_" + i;
    string pageUrl = "http://www.lang34.com/se/" + type + "/index" + index + ".html";

    var trs = RegexHelper.GetMathList(client.GET(pageUrl, Encoding.UTF8), linkPattern);
    int newIdCount = 0;

    // GetMathList may return null when matching fails; treat that as "no links".
    if (trs != null)
    {
        foreach (var item in trs)
        {
            string temp = "";
            if (!RegexHelper.GetMatchStr(item.ToString(), linkPattern, true, out temp))
            {
                continue;
            }

            // Links to the listing pages themselves ("index.html", "index_2.html", ...)
            // match the same pattern but are not video ids.
            if (temp.StartsWith("index", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Skip ids that were already handled (same link repeated on a page or across pages).
            if (!seenIds.Add(temp))
            {
                continue;
            }
            newIdCount++;

            string url = "http://www.lang34.com/e/DownSys/play/?classid=" + classid + "&id=" + temp + "&pathid=0";
            string htmltext = client.GET(url, Encoding.UTF8);

            string mp4 = "";
            if (RegexHelper.GetMatchStr(htmltext, "f:'(.*?)',", true, out mp4))
            {
                string titile = "";
                RegexHelper.GetMatchStr(htmltext, " <title>(.*?)</title>", true, out titile);

                string output = mp4 + "?title" + titile + "\r\n";
                Console.WriteLine(output);
                File.AppendAllText("D://" + type + ".txt", output);
            }
        }
    }

    // Nothing new on this page: past the last page, or the request failed.
    if (newIdCount == 0)
    {
        break;
    }
}
网络请求类
using System;
using System.Collections.Generic;
using System.Text;

namespace MyHelper4Web
{
    /// <summary>
    /// Small wrapper around the WinHTTP COM object (<c>WinHttp.WinHttpRequest</c>)
    /// for issuing GET and POST requests. A single request object is created per
    /// helper instance and reused for every call.
    /// </summary>
    /// <remarks>
    /// Request failures are not thrown to the caller: the byte-returning methods
    /// return the exception message and source encoded with <c>Encoding.Default</c>,
    /// and the string-returning methods return the same text.
    /// </remarks>
    public class WinHttpHelper
    {
        WinHttp.WinHttpRequest request;

        /// <summary>Value sent in the Accept header.</summary>
        public string Accept = "*/*";

        /// <summary>Value sent in the User-Agent header.</summary>
        public string UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; InfoPath.2; .NET4.0E)";

        /// <summary>Value sent in the Content-Type header of POST requests.</summary>
        public string ContentType = "application/json";// "application/x-www-form-urlencoded";

        /// <summary>Request timeout, in seconds, passed to WaitForResponse.</summary>
        public int SetTimeOut = 60;

        /// <summary>Whether redirects are followed automatically.</summary>
        public bool AllowAutoRedirect = true;

        /// <summary>Whether redirects from https to http are allowed.</summary>
        public bool AllowHttpstoHttp = false;

        /// <summary>
        /// Creates a helper with the default headers and a fresh WinHTTP request object.
        /// </summary>
        public WinHttpHelper()
        {
            request = new WinHttp.WinHttpRequest();
        }

        /// <summary>
        /// Creates a helper with custom request headers.
        /// </summary>
        /// <param name="Accept">Accept header value.</param>
        /// <param name="UserAgent">User-Agent header value.</param>
        /// <param name="ContentType">Content-Type header value (used by POST).</param>
        public WinHttpHelper(string Accept, string UserAgent, string ContentType)
            : this() // the request object must exist for every constructor, not only the default one
        {
            this.Accept = Accept;
            this.UserAgent = UserAgent;
            this.ContentType = ContentType;
        }

        /// <summary>
        /// Creates a helper with custom request headers and timeout.
        /// </summary>
        /// <param name="Accept">Accept header value.</param>
        /// <param name="UserAgent">User-Agent header value.</param>
        /// <param name="ContentType">Content-Type header value (used by POST).</param>
        /// <param name="SetTimeOut">Timeout in seconds.</param>
        public WinHttpHelper(string Accept, string UserAgent, string ContentType, int SetTimeOut)
            : this(Accept, UserAgent, ContentType)
        {
            this.SetTimeOut = SetTimeOut;
        }

        /// <summary>
        /// Sends a GET request.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="refer">Referer header value; omitted when null or empty.</param>
        /// <returns>
        /// The response body as bytes (empty array when there is no body), or the
        /// encoded exception text when the request fails.
        /// </returns>
        public byte[] GET(string Url, string refer)
        {
            // The original GET sent an empty string as the body and no Content-Type.
            return Execute("GET", Url, "", refer, false);
        }

        /// <summary>
        /// Sends a GET request and decodes the response body.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>The response text, or the exception text on failure.</returns>
        public string GET(string Url, Encoding Encode)
        {
            return Decode(GET(Url, ""), Encode);
        }

        /// <summary>
        /// Sends a GET request with a Referer header and decodes the response body.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="refer">Referer header value; omitted when null or empty.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>The response text, or the exception text on failure.</returns>
        public string GET(string Url, string refer, Encoding Encode)
        {
            return Decode(GET(Url, refer), Encode);
        }

        /// <summary>
        /// Sends a POST request.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Refer">Referer header value; omitted when null or empty.</param>
        /// <returns>
        /// The response body as bytes (empty array when there is no body), or the
        /// encoded exception text when the request fails.
        /// </returns>
        public byte[] POST(string Url, string PostData, string Refer)
        {
            return Execute("POST", Url, PostData, Refer, true);
        }

        /// <summary>
        /// Sends a POST request without a Referer header.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <returns>See <see cref="POST(string, string, string)"/>.</returns>
        public byte[] POST(string Url, string PostData)
        {
            return POST(Url, PostData, "");
        }

        /// <summary>
        /// Sends a POST request and decodes the response body.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Refer">Referer header value; omitted when null or empty.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>The response text, or the exception text on failure.</returns>
        public string POST(string Url, string PostData, string Refer, Encoding Encode)
        {
            return Decode(POST(Url, PostData, Refer), Encode);
        }

        /// <summary>
        /// Sends a POST request without a Referer header and decodes the response body.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Encode">Encoding used to decode the response bytes.</param>
        /// <returns>The response text, or the exception text on failure.</returns>
        public string POST(string Url, string PostData, Encoding Encode)
        {
            return Decode(POST(Url, PostData, ""), Encode);
        }

        /// <summary>
        /// Returns all response headers of the last request (despite the name, the
        /// result is the full header block, not only cookies).
        /// </summary>
        /// <returns>The raw header text, or an empty string when it cannot be read.</returns>
        public string GetAllCookis()
        {
            try
            {
                return request.GetAllResponseHeaders();
            }
            catch (Exception)
            {
                return "";
            }
        }

        /// <summary>
        /// Shared implementation of GET and POST: configures the reused request
        /// object, sends the request and waits for the response.
        /// </summary>
        private byte[] Execute(string method, string url, string body, string referer, bool sendContentType)
        {
            try
            {
                // The request object is reused between calls, so both options are
                // written on every call; otherwise a redirect setting changed once
                // could never be switched back through the public fields.
                request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableRedirects, AllowAutoRedirect);
                request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableHttpsToHttpRedirects, AllowHttpstoHttp);

                // Opened asynchronously; WaitForResponse below blocks up to SetTimeOut seconds.
                request.Open(method, url, true);
                request.SetRequestHeader("Accept", Accept);
                request.SetRequestHeader("User-Agent", UserAgent);
                if (sendContentType)
                {
                    request.SetRequestHeader("Content-Type", ContentType);
                }
                if (!string.IsNullOrEmpty(referer))
                {
                    request.SetRequestHeader("Referer", referer);
                }

                request.Send(body);
                request.WaitForResponse(SetTimeOut);

                // NOTE(review): ResponseBody is presumably null/empty for a bodiless
                // response; normalise to an empty array so callers can decode safely.
                byte[] responseBody = (byte[])request.ResponseBody;
                return responseBody ?? new byte[0];
            }
            catch (Exception ex)
            {
                // Existing contract of this class: report the failure as the "response".
                return Encoding.Default.GetBytes(ex.Message + ex.Source);
                ////LogHelper.Log.Error("request failed", ex);
            }
        }

        /// <summary>
        /// Decodes response bytes, reporting decoding failures as text like the
        /// rest of the class does.
        /// </summary>
        private static string Decode(byte[] body, Encoding encode)
        {
            try
            {
                return encode.GetString(body);
            }
            catch (Exception ex)
            {
                return ex.Message + ex.Source;
            }
        }
    }
}
正则表达式类
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace MyHelper4Web
{
    /// <summary>
    /// Regular-expression helpers for pulling values out of HTML text.
    /// All matching is done with <c>RegexOptions.Singleline | RegexOptions.IgnoreCase</c>.
    /// </summary>
    public class RegexHelper
    {
        private const RegexOptions MatchOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        /// <summary>
        /// Finds the first match of <paramref name="pattern"/> and returns either the
        /// captured value or the whole matched text.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression, normally containing one capture group such as <c>(.*?)</c>.</param>
        /// <param name="isCut">
        /// true to return only the first capture group; false to return the whole match.
        /// </param>
        /// <param name="result">The extracted text, or an empty string on failure.</param>
        /// <returns>true when a match with a non-empty captured value was found.</returns>
        public static bool GetMatchStr(string htmltext, string pattern, bool isCut, out string result)
        {
            string whole;
            string captured;
            if (!TryFirstMatch(htmltext, pattern, out whole, out captured))
            {
                result = "";
                return false;
            }

            result = isCut ? captured : whole;
            return true;
        }

        /// <summary>
        /// Same as <see cref="GetMatchStr(string, string, bool, out string)"/> but
        /// returns the text directly.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression, normally containing one capture group.</param>
        /// <param name="isCut">
        /// true to return only the first capture group; false to return the whole match.
        /// </param>
        /// <returns>The extracted text, or an empty string when nothing matched.</returns>
        public static string GetMatchString(string htmltext, string pattern, bool isCut)
        {
            string result;
            GetMatchStr(htmltext, pattern, isCut, out result);
            return result;
        }

        /// <summary>
        /// Finds the first match of <paramref name="pattern"/> and returns its first
        /// capture group (or the whole match when the pattern has no group).
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression.</param>
        /// <param name="result">The captured text, or an empty string on failure.</param>
        /// <returns>true when a non-empty value was extracted.</returns>
        public static bool GetMatchStr(string htmltext, string pattern, out string result)
        {
            string whole;
            if (!TryFirstMatch(htmltext, pattern, out whole, out result))
            {
                result = "";
                return false;
            }
            return true;
        }

        /// <summary>
        /// Returns the text of every match of <paramref name="pattern"/> in
        /// <paramref name="htmltext"/>.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of matched strings; empty (never null) when there
        /// is no match or the arguments are invalid.
        /// </returns>
        public static ArrayList GetMathList(string htmltext, string pattern)
        {
            ArrayList list = new ArrayList();
            try
            {
                foreach (Match match in Regex.Matches(htmltext, pattern, MatchOptions))
                {
                    list.Add(match.Value);
                }
            }
            catch (ArgumentException)
            {
                // Null input or invalid pattern: report "no matches" so callers can
                // enumerate the result without a null check.
                return new ArrayList();
            }
            return list;
        }

        /// <summary>
        /// Runs the first-match search shared by the public overloads.
        /// </summary>
        /// <remarks>
        /// The captured value comes from regex capture group 1 rather than from
        /// stripping the pattern's literal prefix/suffix out of the matched text,
        /// so it is correct under case-insensitive matching and for patterns that
        /// contain regex metacharacters such as '^' or '|'.
        /// </remarks>
        private static bool TryFirstMatch(string htmltext, string pattern, out string whole, out string captured)
        {
            whole = "";
            captured = "";
            try
            {
                Match match = Regex.Match(htmltext, pattern, MatchOptions);
                if (!match.Success)
                {
                    return false;
                }

                whole = match.Value;
                // Group 0 is the whole match; group 1 exists only if the pattern has a capture group.
                captured = match.Groups.Count > 1 ? match.Groups[1].Value : match.Value;
                return !string.IsNullOrEmpty(captured);
            }
            catch (ArgumentException)
            {
                // Null input or invalid pattern (includes ArgumentNullException).
                return false;
            }
        }
    }
}