【C#爬虫】抓取XX网站mp4资源地址

抓取小视频的 URL 地址,然后将地址信息拷贝到迅雷里批量下载就 OK 了。

主程序 代码

            //yazhouqingseAV 35
            //zhifusiwaAV 29
            //zipaishipin 30
            //oumeiqingseAV 28
            //katongdongman 31 
            //tongxingAV 32
            //sanjidianying 33
            //fengkuangqunjiao 34

            var client = new WinHttpHelper();
            var type = "fengkuangqunjiao";
            var classid = 34;

            // Regex that picks detail-page links of the form "<type>/<id>.html" out of a
            // listing page; the "(.*?)" part is the id that is fed to the player URL.
            var linkPattern = "" + type + "/(.*?).html";

            // Walk the paginated listing: page 1 is index.html, page N is index_N.html.
            // The original loop condition (i > -1) never became false in practice, so the
            // crawler kept requesting non-existent pages forever; we now stop at the first
            // listing page that yields no links.
            for (int i = 1; ; i++)
            {
                Console.WriteLine(i);
                var index = "_" + i;
                if (i == 1)
                    index = "";

                string pageUrl = "http://www.lang34.com/se/" + type + "/index" + index + ".html";

                var trs = RegexHelper.GetMathList(client.GET(pageUrl, Encoding.UTF8), linkPattern);

                // GetMathList returns null when matching fails; an empty list is treated as
                // "past the last page". NOTE(review): this is a heuristic - if the site's
                // error page still contains matching links the loop will not stop here.
                if (trs == null || trs.Count == 0)
                    break;

                foreach (var item in trs)
                {
                    // Extract the numeric/page id from the matched link text.
                    string temp = "";
                    if (RegexHelper.GetMatchStr(item.ToString(), linkPattern, true, out temp))
                    {
                        string url = "http://www.lang34.com/e/DownSys/play/?classid=" + classid + "&id=" + temp + "&pathid=0";
                        string htmltext = client.GET(url, Encoding.UTF8);

                        // The player page embeds the media address as  f:'<address>',
                        string mp4 = "";
                        if (RegexHelper.GetMatchStr(htmltext, "f:'(.*?)',", true, out mp4))
                        {
                            // The title is optional; an empty string is written when it is missing.
                            string titile = "";
                            RegexHelper.GetMatchStr(htmltext, " <title>(.*?)</title>", true, out titile);

                            string output = mp4 + "?title" + titile + "\r\n";
                            Console.WriteLine(output);
                            File.AppendAllText("D://" + type + ".txt", output);
                        }
                    }

                }
            }

网络请求类

using System;
using System.Collections.Generic;
using System.Text;

namespace MyHelper4Web
{
    /// <summary>
    /// Small synchronous HTTP helper built on the WinHttp COM request object.
    /// A single underlying request object is reused for every call made through
    /// one instance. Transport errors are not thrown: the byte-array methods return
    /// the exception text encoded with <see cref="Encoding.Default"/>, and the string
    /// methods return the exception text, in place of the response body.
    /// </summary>
    public class WinHttpHelper
    {
        private readonly WinHttp.WinHttpRequest request;

        /// <summary>Value sent in the Accept header.</summary>
        public string Accept = "*/*";
        /// <summary>Value sent in the User-Agent header.</summary>
        public string UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; InfoPath.2; .NET4.0E)";
        /// <summary>Value sent in the Content-Type header of POST requests.</summary>
        public string ContentType = "application/json";// "application/x-www-form-urlencoded";
        /// <summary>Value passed to WaitForResponse; the original comment describes it as a timeout in seconds.</summary>
        public int SetTimeOut = 60;
        /// <summary>Whether redirects are followed automatically.</summary>
        public bool AllowAutoRedirect = true;
        /// <summary>Whether redirects from https to http are allowed.</summary>
        public bool AllowHttpstoHttp = false;

        /// <summary>
        /// Creates a helper with the default header values.
        /// </summary>
        public WinHttpHelper()
        {
            request = new WinHttp.WinHttpRequest();
        }

        /// <summary>
        /// Creates a helper with custom request headers.
        /// </summary>
        /// <param name="Accept">Accept header value.</param>
        /// <param name="UserAgent">User-Agent header value.</param>
        /// <param name="ContentType">Content-Type header value (used by POST).</param>
        public WinHttpHelper(string Accept, string UserAgent, string ContentType)
            : this() // the original skipped this, leaving the request object null
        {
            this.Accept = Accept;
            this.UserAgent = UserAgent;
            this.ContentType = ContentType;
        }

        /// <summary>
        /// Creates a helper with custom request headers and timeout.
        /// </summary>
        /// <param name="Accept">Accept header value.</param>
        /// <param name="UserAgent">User-Agent header value.</param>
        /// <param name="ContentType">Content-Type header value (used by POST).</param>
        /// <param name="SetTimeOut">Value passed to WaitForResponse.</param>
        public WinHttpHelper(string Accept, string UserAgent, string ContentType, int SetTimeOut)
            : this(Accept, UserAgent, ContentType)
        {
            this.SetTimeOut = SetTimeOut;
        }

        /// <summary>
        /// Sends a GET request.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="refer">Referer header value; omitted when null or empty.</param>
        /// <returns>The response body as bytes, or the encoded exception text on failure.</returns>
        public byte[] GET(string Url, string refer)
        {
            return Execute("GET", Url, "", refer, false);
        }

        /// <summary>
        /// Sends a GET request without a Referer header and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="Encode">Encoding used to turn the response bytes into a string.</param>
        /// <returns>The response body as a string, or exception text on failure.</returns>
        public string GET(string Url, Encoding Encode)
        {
            return Decode(GET(Url, ""), Encode);
        }

        /// <summary>
        /// Sends a GET request with a Referer header and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="refer">Referer header value; omitted when null or empty.</param>
        /// <param name="Encode">Encoding used to turn the response bytes into a string.</param>
        /// <returns>The response body as a string, or exception text on failure.</returns>
        public string GET(string Url, string refer, Encoding Encode)
        {
            // Decoding failures are now reported the same way as in the sibling overloads
            // instead of escaping as an exception.
            return Decode(GET(Url, refer), Encode);
        }

        /// <summary>
        /// Sends a POST request.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Refer">Referer header value; omitted when null or empty.</param>
        /// <returns>The response body as bytes, or the encoded exception text on failure.</returns>
        public byte[] POST(string Url, string PostData, string Refer)
        {
            return Execute("POST", Url, PostData, Refer, true);
        }

        /// <summary>
        /// Sends a POST request without a Referer header.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <returns>The response body as bytes, or the encoded exception text on failure.</returns>
        public byte[] POST(string Url, string PostData)
        {
            return POST(Url, PostData, "");
        }

        /// <summary>
        /// Sends a POST request and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Refer">Referer header value; omitted when null or empty.</param>
        /// <param name="Encode">Encoding used to turn the response bytes into a string.</param>
        /// <returns>The response body as a string, or exception text on failure.</returns>
        public string POST(string Url, string PostData, string Refer, Encoding Encode)
        {
            return Decode(POST(Url, PostData, Refer), Encode);
        }

        /// <summary>
        /// Sends a POST request without a Referer header and decodes the response.
        /// </summary>
        /// <param name="Url">URL to request.</param>
        /// <param name="PostData">Request body.</param>
        /// <param name="Encode">Encoding used to turn the response bytes into a string.</param>
        /// <returns>The response body as a string, or exception text on failure.</returns>
        public string POST(string Url, string PostData, Encoding Encode)
        {
            return Decode(POST(Url, PostData, ""), Encode);
        }

        /// <summary>
        /// Returns all response headers of the most recent request as one string
        /// (despite the name, the result is not limited to cookies).
        /// </summary>
        /// <returns>The raw response headers, or an empty string when they cannot be read.</returns>
        public string GetAllCookis()
        {
            try
            {
                return request.GetAllResponseHeaders();
            }
            catch (Exception)
            {
                // Best effort by design: e.g. no request has been sent yet.
                return "";
            }
        }

        /// <summary>
        /// Shared request pipeline for GET and POST: applies the redirect options,
        /// opens the request, sets the headers, sends the body and waits for the response.
        /// </summary>
        private byte[] Execute(string method, string url, string body, string referer, bool sendContentType)
        {
            try
            {
                // Apply both options on every call. The original only ever switched them
                // one way, so changing the public flags back had no effect on the reused
                // request object.
                request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableRedirects, AllowAutoRedirect);
                request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableHttpsToHttpRedirects, AllowHttpstoHttp);

                request.Open(method, url, true);
                request.SetRequestHeader("Accept", Accept);
                request.SetRequestHeader("User-Agent", UserAgent);
                if (sendContentType)
                {
                    request.SetRequestHeader("Content-Type", ContentType);
                }
                if (!string.IsNullOrEmpty(referer))
                {
                    request.SetRequestHeader("Referer", referer);
                }
                request.Send(body);
                request.WaitForResponse(SetTimeOut);

                // NOTE(review): ResponseBody is presumably null for an empty response;
                // normalise to an empty array so callers can decode it safely.
                return (byte[])request.ResponseBody ?? new byte[0];
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported through the returned payload.
                return Encoding.Default.GetBytes(ex.Message + ex.Source);
            }
        }

        /// <summary>
        /// Decodes a response body, reporting decoding failures as text like the request methods do.
        /// </summary>
        private static string Decode(byte[] body, Encoding Encode)
        {
            try
            {
                return Encode.GetString(body);
            }
            catch (Exception ex)
            {
                return ex.Message + ex.Source;
            }
        }
    }
}

正则表达式类

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

namespace MyHelper4Web
{
    /// <summary>
    /// Regex convenience helpers for pulling text out of HTML.
    /// Patterns are matched with Singleline and IgnoreCase. A pattern may contain the
    /// placeholder "(.*?)"; the text matched by its first occurrence is the "cut" result.
    /// A pattern without the placeholder yields the whole match.
    /// </summary>
    public class RegexHelper
    {
        private const string Placeholder = "(.*?)";
        private const string PlaceholderGroup = "__placeholder";
        private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        /// <summary>
        /// Finds the first match of <paramref name="pattern"/> in <paramref name="htmltext"/>.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression, optionally containing "(.*?)".</param>
        /// <param name="isCut">
        /// true: return only the text matched by the placeholder;
        /// false: return the whole matched text.
        /// </param>
        /// <param name="result">The extracted text, or an empty string when the method returns false.</param>
        /// <returns>
        /// true when a match was found and the placeholder text (or the whole match, for
        /// patterns without a placeholder) is non-empty; otherwise false.
        /// </returns>
        public static bool GetMatchStr(string htmltext, string pattern, bool isCut, out string result)
        {
            string whole;
            string captured;

            // As before, an empty capture counts as "not found".
            bool found = TryMatch(htmltext, pattern, out whole, out captured) && captured.Length > 0;

            // The original returned the pattern's literal prefix+suffix here when nothing
            // matched and isCut was false; a failed lookup now always yields "".
            result = found ? (isCut ? captured : whole) : "";
            return found;
        }

        /// <summary>
        /// Same lookup as the bool-returning overload with <paramref name="isCut"/>,
        /// but returns the text directly.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression, optionally containing "(.*?)".</param>
        /// <param name="isCut">true for the placeholder text only, false for the whole match.</param>
        /// <returns>The extracted text, or an empty string when nothing usable was found.</returns>
        public static string GetMatchString(string htmltext, string pattern, bool isCut)
        {
            string result;
            GetMatchStr(htmltext, pattern, isCut, out result);
            return result;
        }

        /// <summary>
        /// Single-match lookup that returns only the placeholder text.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression, optionally containing "(.*?)".</param>
        /// <param name="result">The extracted text, or an empty string when the method returns false.</param>
        /// <returns>true when a non-empty result was extracted.</returns>
        public static bool GetMatchStr(string htmltext, string pattern, out string result)
        {
            return GetMatchStr(htmltext, pattern, true, out result);
        }

        /// <summary>
        /// Multi-match lookup: returns the full text of every match.
        /// </summary>
        /// <param name="htmltext">Text to search.</param>
        /// <param name="pattern">Regular expression.</param>
        /// <returns>
        /// An ArrayList of strings, one per match. It is empty (never null) when there are
        /// no matches, an argument is null, or the pattern is invalid.
        /// </returns>
        public static ArrayList GetMathList(string htmltext, string pattern)
        {
            ArrayList list = new ArrayList();
            if (htmltext == null || pattern == null)
            {
                return list;
            }

            try
            {
                Regex regex = new Regex(pattern, Options);
                foreach (Match match in regex.Matches(htmltext))
                {
                    list.Add(match.Value);
                }
            }
            catch (ArgumentException)
            {
                // Invalid pattern: report "no matches" instead of handing callers a null
                // that would crash their foreach.
                return new ArrayList();
            }
            return list;
        }

        /// <summary>
        /// Runs the pattern once and reports both the whole match and the placeholder text.
        /// </summary>
        private static bool TryMatch(string htmltext, string pattern, out string whole, out string captured)
        {
            whole = "";
            captured = "";
            if (htmltext == null || string.IsNullOrEmpty(pattern))
            {
                return false;
            }

            try
            {
                // Turn the first placeholder into a named group so its text can be read
                // straight from the match. The original stripped the pattern's literal
                // prefix/suffix from the matched text, which broke with IgnoreCase, with
                // regex metacharacters in the pattern, and when the captured text itself
                // contained the prefix. Caveat: numbered backreferences in a pattern that
                // mixes the placeholder with other groups may be renumbered by this.
                int at = pattern.IndexOf(Placeholder, StringComparison.Ordinal);
                string effective = at < 0
                    ? pattern
                    : pattern.Substring(0, at) + "(?<" + PlaceholderGroup + ">.*?)" + pattern.Substring(at + Placeholder.Length);

                Match match = new Regex(effective, Options).Match(htmltext);
                if (!match.Success)
                {
                    return false;
                }

                whole = match.Value;
                captured = at < 0 ? whole : match.Groups[PlaceholderGroup].Value;
                return true;
            }
            catch (ArgumentException)
            {
                // Invalid pattern: treated as "no match", as the original did.
                return false;
            }
        }
    }
}

 

posted @ 2016-09-28 09:49  0539  阅读(6269)  评论(2)  编辑  收藏  举报