C# 网络爬虫

 

概念知识

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/// 网络爬虫:是个应用程序,自动提取网页的程序,
/// 通过数据筛选、过滤,得到有用的信息,一般是为搜索引擎服务或者作为内容来源
/// Application+WebRequest+Filter+Data+Threads
/// 为什么要这个?
/// 做个内容站--小说/电影/动漫---阿里云+爬虫+Web
/// 数据搜集爬虫---招标数据爬虫/淘宝数据/招聘信息
/// 竞品分析--抓取竞争对手数据
/// 爬虫违法吗?
/// 不问自取谓之偷;爬虫能拿到的信息都是浏览器能访问到的,就是公开数据;
/// 不要基于盈利(小爬虫都没事儿);360搜索引擎--被判赔偿--违背了robots
///
/// 爬虫攻防:
/// robot协议(道德防线):根域名/robots.txt 弱的约定
/// ---服务端请求信息(urlrefer/agent){爬虫模拟好就能突破}
/// ---用户登录{模拟请求带上cookie}
/// ---IP黑名单白名单{代理请求}
/// ---识别爬虫后定期返回验证码{换IP/打码平台}
/// ---js动态加载/动态修改/数据图片化{可以解决}
/// 爬虫:道高一尺魔高一丈,任何信息是无法阻止抓取的
///
/// Html下载----数据筛选清洗入库---多线程
///
/// 下载了一大堆的HTML,那么怎么去获取有用的信息呢?
/// 怎么去进一步下载更多的东西呢?
///
/// 数据筛选:正则(麻烦)/indexof+substring+replace/第三方工具包HtmlAgilityPack支持Xpath(本质是正则)
/// 安全控件--ActiveX
///
/// 1 深度抓取&批量数据高效匹配获取
/// 2 所见非所得属性获取
/// 3 Ajax数据的获取
/// 4 多线程抓取
///
/// 百度爬虫的规则:title keyword description +文字
///
/// 怎么样把京东全部的数据都抓到呢?
/// https://www.jd.com/allSort.aspx 作为入口找出全部类别
/// 然后一个一个类别抓取
/// 1000多个类别,就是1000多个任务,每个任务的耗时还不一样,
/// 如何用指定的线程数,去最高效的完成这些任务?
///
/// 类别数据的抓取
/// + 刚才那个bug
/// + 多线程任务分配

 1.获取Html(通过模拟http请求) 

http请求帮助类

复制代码
    /// <summary>
    /// http://tool.sufeinet.com/HttpHelper.aspx
    /// </summary>
    public class HttpHelper
    {
        private static Logger logger = new Logger(typeof(HttpHelper));

        /// <summary>
        /// 根据url下载内容  之前是GB2312
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        public static string DownloadUrl(string url)
        {
            return DownloadHtml(url, Encoding.UTF8);
        }

        //HttpClient--WebApi

        /// <summary>
        /// 下载html
        /// http://tool.sufeinet.com/HttpHelper.aspx
        /// HttpWebRequest功能比较丰富,WebClient使用比较简单
        /// WebRequest
        /// 
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        public static string DownloadHtml(string url, Encoding encode)
        {
            string html = string.Empty;
            try
            {

                //可以使用httpWebRequest  也可以使用HttpClent
                //https可以下载--

                //ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback((object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors) =>
                //{
                //    return true; //总是接受  
                //});
                //ServicePointManager.SecurityProtocol = SecurityProtocolType.Ssl3 | SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls;

                HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;//模拟请求
                request.Timeout = 30 * 1000;//设置30s的超时
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
                //request.UserAgent = "User - Agent:Mozilla / 5.0(iPhone; CPU iPhone OS 7_1_2 like Mac OS X) App leWebKit/ 537.51.2(KHTML, like Gecko) Version / 7.0 Mobile / 11D257 Safari / 9537.53";

                request.ContentType = "text/html; charset=utf-8";// "text/html;charset=gbk";// 
                 
                //request.Host = "search.yhd.com"; 
                //request.Headers.Add("Cookie", @"newUserFlag=1; guid=YFT7C9E6TMFU93FKFVEN7TEA5HTCF5DQ26HZ; gray=959782; cid=av9kKvNkAPJ10JGqM_rB_vDhKxKM62PfyjkB4kdFgFY5y5VO; abtest=31; _ga=GA1.2.334889819.1425524072; grouponAreaId=37; provinceId=20; search_showFreeShipping=1; rURL=http%3A%2F%2Fsearch.yhd.com%2Fc0-0%2Fkiphone%2F20%2F%3Ftp%3D1.1.12.0.73.Ko3mjRR-11-FH7eo; aut=5GTM45VFJZ3RCTU21MHT4YCG1QTYXERWBBUFS4; ac=57265177%40qq.com; msessionid=H5ACCUBNPHMJY3HCK4DRF5VD5VA9MYQW; gc=84358431%2C102362736%2C20001585%2C73387122; tma=40580330.95741028.1425524063040.1430288358914.1430790348439.9; tmd=23.40580330.95741028.1425524063040.; search_browse_history=998435%2C1092925%2C32116683%2C1013204%2C6486125%2C38022757%2C36224528%2C24281304%2C22691497%2C26029325; detail_yhdareas=""; cart_cookie_uuid=b64b04b6-fca7-423b-b2d1-ff091d17e5e5; gla=20.237_0_0; JSESSIONID=14F1F4D714C4EE1DD9E11D11DDCD8EBA; wide_screen=1; linkPosition=search");

                //request.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
                //request.Headers.Add("Accept-Encoding", "gzip, deflate, sdch");
                //request.Headers.Add("Referer", "http://list.yhd.com/c0-0/b/a-s1-v0-p1-price-d0-f0-m1-rt0-pid-mid0-kiphone/");

                //Encoding enc = Encoding.GetEncoding("GB2312"); // 如果是乱码就改成 utf-8 / GB2312

                //如何自动读取cookie
                request.CookieContainer = new CookieContainer();//1 给请求准备个container
                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)//发起请求
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                    }
                    else
                    {
                        try
                        {
                            //string sessionValue = response.Cookies["ASP.NET_SessionId"].Value;//2 读取cookie
                            StreamReader sr = new StreamReader(response.GetResponseStream(), encode);
                            html = sr.ReadToEnd();//读取数据
                            sr.Close();
                        }
                        catch (Exception ex)
                        {
                            logger.Error(string.Format($"DownloadHtml抓取{url}失败"), ex);
                            html = null;
                        }
                    }
                }
            }
            catch (System.Net.WebException ex)
            {
                if (ex.Message.Equals("远程服务器返回错误: (306)。"))
                {
                    logger.Error("远程服务器返回错误: (306)。", ex);
                    html = null;
                }
            }
            catch (Exception ex)
            {
                logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
                html = null;
            }
            return html;
        }


        public static string DownloadJsonData(string url, Encoding encode)
        {
            string html = string.Empty;
            try
            {
                HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;//模拟请求
                request.Timeout = 30 * 1000;//设置30s的超时
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36";

                request.ContentType = "application/json;charset=utf-8";
                request.Referer = "https://ke.qq.com/admin/index.html";
                request.Accept = "*/*";


                request.Headers.Add("authority", "ke.qq.com");
                request.Headers.Add("method", "GET");
                request.Headers.Add("path", "/cgi-agency/agency/manager/get_course_list?aid=80207&count=10&page=1&pay_type=2&only_owner=1&cname_key=&course_state=0&signup_state=0&sort_method=0&is_ascending=0&bkn=449651946&t=0.0029");

                request.Headers.Add("scheme", "https");
                request.Headers.Add("accept-encodin", "gzip, deflate, br");
                request.Headers.Add("accept-language", "zh-CN,zh;q=0.9");
                request.Headers.Add("sec-fetch-mode", "cors");
                request.Headers.Add("sec-fetch-site", "same-origin");
                 
                request.CookieContainer = new CookieContainer();//1 给请求准备个container
                request.Headers.Add("cookie", @"pgv_pvi=5087136768; RK=WXbobkaNsr; ptcz=447bb81ae5a54414d650b3476dced39f0cb80683ac471ec83f6cd827e4fdf5b7; pgv_pvid=3854687776; ts_uid=2094536335; localInterest=[2002]; iswebp=1; pac_uid=1_2751435708; ied_qq=o2751435708; isHideDealTips=1; ts_refer=pay.qq.com/enterprise/separate.shtml; tvfe_boss_uuid=86ed223020e11b4d; o_cookie=2751435708; ke_login_type=1; luin=o2751435708; course_origin=[{'cid':297038,'ext':{'pagelocation':'list, 1.3'}}]; tdw_data_testid=; tdw_data_flowid=; tdw_auin_data=-; tdw_first_visited=1; pgv_info=ssid=s6817397715; Hm_lvt_0c196c536f609d373a16d246a117fd44=1575856560,1575893514,1575896171,1575951173; index_new_key={'index_interest_cate_id':2002}; _qpsvr_localtk=0.05340837932628206; pgv_si=s9919754240; miniapp_qrcode_id=0115055f128549d785b0b1593b6084e4; uin=o2751435708; skey=@iGqJqPF6r; ptisp=cnc; lskey=000100005c4c2b2b6499fad1306a5536e639f4503825b33a2f4ee7f6c8ad4b5d43621199c8c99163c07c78c0; p_uin=o2751435708; pt4_token=SXdux*M4NNFzFRs6r59i-wKIVO4xk1AofSqAo8pvGg8_; p_skey=B4sUwtIxpxoGHY7OrQue0wyl2KkTlW4-Jr0MPc7Vf88_; p_luin=o2751435708; p_lskey=000400007af2a05b862ff9c18913d8cb474fb1c0c9f957de9f12b024a4372a5544cd95ecca989efba4edd06a; _pathcode=0.8704610796361238; tdw_data_sessionid=157595933804520618739236; ts_last=ke.qq.com/course/list; Hm_lpvt_0c196c536f609d373a16d246a117fd44=1575959482; tdw_data_new_2={'auin':' - ','sourcetype':'tuin','sourcefrom':'a3ff93bc','ver9':2751435708,'uin':2751435708,'visitor_id':'8882822910493176','url_page':'index','url_module':'searchbar','url_position':''}; tdw_data={'ver4':'4','ver5':'','ver6':'','refer':'','from_channel':'','path':'aBar - 0.8704610796361238','auin':' - ','uin':2751435708,'real_uin':2751435708}");

                //如何自动读取cookie 
                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)//发起请求
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                    }
                    else
                    {
                        try
                        {
                            //string sessionValue = response.Cookies["ASP.NET_SessionId"].Value;//2 读取cookie
                            StreamReader sr = new StreamReader(response.GetResponseStream(), encode);
                            html = sr.ReadToEnd();//读取数据
                            sr.Close();
                        }
                        catch (Exception ex)
                        {
                            logger.Error(string.Format($"DownloadHtml抓取{url}失败"), ex);
                            html = null;
                        }
                    }
                }
            }
            catch (System.Net.WebException ex)
            {
                if (ex.Message.Equals("远程服务器返回错误: (306)。"))
                {
                    logger.Error("远程服务器返回错误: (306)。", ex);
                    html = null;
                }
            }
            catch (Exception ex)
            {
                logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
                html = null;
            }
            return html;
        }

    }
复制代码

调用

 string url = $"{Constant.TencentClassUrl}/course/list/?tuin=7e4f8b7d";
 string html = HttpHelper.DownloadUrl(url);

  2. 筛选,过滤,获取有用的信息

用HtmlAgilityPack读取HTMl内容

  HtmlDocument doc = new HtmlDocument();
  doc.LoadHtml(html);

使用XPath筛选HMTL的元素节点,获取有用信息

复制代码
            string path = "//*[@id='auto-test-1']/div[1]/dl/dd";
            HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);

            foreach (HtmlNode node in nodeList)
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(node.InnerHtml);

                string codePath = "//a";
                HtmlNode codeNode = htmlDocument.DocumentNode.SelectSingleNode(codePath);
                string href = codeNode.Attributes["href"].Value;
                if (!string.IsNullOrWhiteSpace(href))
                {
                    href = href.Replace(";", "&");
                }

                string code = string.Empty;
                if (href != null && href.IndexOf("st=") != -1)
                {
                    href = href.Replace(";", "&");
                    code = href.Substring(href.IndexOf("st=") + 3, 4);
                }
                TencentCategoryEntity category = new TencentCategoryEntity()
                {
                    Id = _Count++,
                    State = 1,
                    CategoryLevel = 2,
                    Code = code,
                    ParentCode = parentCode
                };
                category.Name = codeNode.InnerText;
                category.Url = href;

                categoryList.Add(category);

                if (codeNode.InnerText != "全部")
                {
                    categoryList.AddRange(this.Third($"{Constant.TencentClassUrl}{href}&tuin=7e4f8b7d", code));
                }
            }
            return categoryList;
复制代码

   3. 数据汇总,入库

获取某一页的数据

复制代码
 public void Show(string url)
        {
            string strHtml = HttpHelper.DownloadUrl(url);
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(strHtml);
            string liPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li";
            HtmlNodeCollection liNodes = document.DocumentNode.SelectNodes(liPath);
            foreach (var node in liNodes)
            {
                Console.WriteLine();
                Console.WriteLine();
                Console.WriteLine("************************************************");
                HtmlDocument lidocument = new HtmlDocument();
                lidocument.LoadHtml(node.OuterHtml);
                string aPath = "//*/a[1]";
                HtmlNode classANode = lidocument.DocumentNode.SelectSingleNode(aPath);
                string aHref = classANode.Attributes["href"].Value;

                Console.WriteLine($"课程Url:{aHref}");

                string Id = classANode.Attributes["data-id"].Value;

                Console.WriteLine($"课程Id:{Id}");

                string imgPath = "//*/a[1]/img";
                HtmlNode imgNode = lidocument.DocumentNode.SelectSingleNode(imgPath);
                string imgUrl = imgNode.Attributes["src"].Value;

                Console.WriteLine($"ImageUrl:{imgUrl}");

                string namePaths = "//*/h4/a[1]";
                HtmlNode nameNode = lidocument.DocumentNode.SelectSingleNode(namePaths);
                string name = nameNode.InnerText;
                Console.WriteLine(name);

                Console.WriteLine($"课程名称:{name}");
                // courseEntity.Price = new Random().Next(100, 10000);  //关于腾讯课堂上的课程价格抓取 这是一个进阶内容  通过普通方式搞不了(他有一个自己的算法) 

                count = count + 1;
            }
        }
复制代码
复制代码
    public void ShowPageData(string url)
        {
            string strHtml = HttpHelper.DownloadUrl(url);
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(strHtml);
            string pagePath = "/html/body/section[1]/div/div[5]/a[@class='page-btn']";
            HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath);
            int maxPage = pageNodes.Select(p => int.Parse(p.InnerText)).Max();
            for (int page = 1; page <= maxPage; page++)
            {
                string pageUrl = $"{url}&page={page}";
                Show(pageUrl);
            }
            Console.WriteLine($"一共抓取数据{count}条");
        }
复制代码

分页获取

复制代码
  #region 分页抓取 
        private void GetPageCourseData()
        {
            //1. 确定总页数
            //2. 分别抓取每一页的数据
            //3. 分析  过滤  清洗
            //4. 入库 

            category.Url = $"{Constant.TencentClassUrl}{category.Url}";

            string strHtml = HttpHelper.DownloadUrl(category.Url);
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(strHtml);
            //Xpath
            string pagePath = "/html/body/section[1]/div/div[@class='sort-page']/a[@class='page-btn']";
            HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath);

            int pageCount = 1;
            if (pageNodes != null)
            {
                pageCount = pageNodes.Select(a => int.Parse(a.InnerText)).Max();
            }
            List<CourseEntity> courseList = new List<CourseEntity>();

            for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
            {
                Console.WriteLine($"******************************当前是第{pageIndex}页数据************************************");
                string pageIndexUrl = $"{category.Url}&page={pageIndex}";
                List<CourseEntity> courseEntities = GetPageIndeData(pageIndexUrl);
                courseList.AddRange(courseEntities);
            }
            courseRepository.SaveList(courseList);


        }

        private List<CourseEntity> GetPageIndeData(string url)
        {
            //获取li标签里面的数据 
            // 先获取所有的Li 
            //  然后循环获取li中的有效数据
            string strHtml = HttpHelper.DownloadUrl(url);
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(strHtml);
            string liPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li";
            HtmlNodeCollection liNodes = document.DocumentNode.SelectNodes(liPath);

            List<CourseEntity> courseEntities = new List<CourseEntity>();
            foreach (var node in liNodes)
            {
                CourseEntity courseEntity = GetLiData(node);
                courseEntities.Add(courseEntity);
            }
            return courseEntities;
        }

        /// <summary>
        /// 当我们把这些数据获取到以后,那就应该保存起来
        /// </summary>
        /// <param name="node"></param>
        private CourseEntity GetLiData(HtmlNode node)
        {
            CourseEntity courseEntity = new CourseEntity();
            //从这里开始 
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(node.OuterHtml);
            string aPath = "//*/a[1]";
            HtmlNode classANode = document.DocumentNode.SelectSingleNode(aPath);
            string aHref = classANode.Attributes["href"].Value;
            courseEntity.Url = aHref;

            Console.WriteLine($"课程Url:{aHref}");

            string Id = classANode.Attributes["data-id"].Value;

            Console.WriteLine($"课程Id:{Id}");

            courseEntity.CourseId = long.Parse(Id);

            string imgPath = "//*/a[1]/img";
            HtmlNode imgNode = document.DocumentNode.SelectSingleNode(imgPath);
            string imgUrl = imgNode.Attributes["src"].Value;
            courseEntity.ImageUrl = imgUrl;

            Console.WriteLine($"ImageUrl:{imgUrl}");

            string namePaths = "//*/h4/a[1]";
            HtmlNode nameNode = document.DocumentNode.SelectSingleNode(namePaths);
            string name = nameNode.InnerText;

            courseEntity.Title = name;

            Console.WriteLine($"课程名称:{name}");

            courseEntity.Price = new Random().Next(100, 10000);  //关于腾讯课堂上的课程价格抓取 这是一个进阶内容  通过普通方式搞不了(他有一个自己的算法) 
            return courseEntity;

        }
        #endregion
复制代码

入库操作

复制代码
   //分表之后, 以相对均匀的数据量存储到 不同的数据表中  
        //这是为了提高数据库的性能,其实是降低每一张表的,数据操作压力  
        //如果爬虫爬取的是亿万级数据,这种分表的方式是有必要的  
        public void SaveList(List<CourseEntity> courseList)
        {
            if (courseList == null || courseList.Count == 0) return;
            IEnumerable<IGrouping<string, CourseEntity>> group = courseList.GroupBy<CourseEntity, string>(c => GetTableName(c));

            foreach (var data in group)
            {
                SqlHelper.InsertList<CourseEntity>(data.ToList(), data.Key);
            }
        }

        private string GetTableName(CourseEntity course)
        {
            return string.Format("Tencent_Subject_{0}", (course.CourseId % 30 + 1).ToString("000"));
        }
复制代码
复制代码
    public class SqlHelper
    {
        private static Logger logger = new Logger(typeof(SqlHelper));
        private static string _ConnStr = ConfigurationManager.ConnectionStrings["conn"].ConnectionString;

        /// <summary>
        /// 事务执行
        /// </summary>
        /// <param name="sql"></param>
        public static void ExecuteNonQuery(string sql)
        {
            using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
            {
                sqlConn.Open();
                SqlCommand cmd = new SqlCommand(sql, sqlConn);
                cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();//
            }
        }

        public static void ExecuteNonQueryWithTrans(string sql)
        {
            SqlTransaction trans = null;
            try
            {
                using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
                {
                    sqlConn.Open();
                    trans = sqlConn.BeginTransaction();
                    SqlCommand cmd = new SqlCommand(sql, sqlConn, trans);
                    cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();//
                    trans.Commit();
                }
            }
            catch (Exception ex)
            {
                //logger.Error(string.Format("ExecuteNonQueryWithTrans出现异常,sql={0}", sql), ex);
                if (trans != null && trans.Connection != null)
                    trans.Rollback();
                throw ex;
            }
            finally
            {
            }
        }

        public static List<T> QueryList<T>(string sql) where T : new()
        {
            using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
            {
                sqlConn.Open();
                SqlCommand cmd = new SqlCommand(sql, sqlConn);
                return TransList<T>(cmd.ExecuteReader());
            }
        }

        public static void Insert<T>(T model, string tableName) where T : new()
        {
            string sql = GetInsertSql<T>(model, tableName);
            ExecuteNonQuery(sql);
        }

        public static void InsertList<T>(List<T> list, string tableName) where T : new()
        {
            string sql = string.Join(" ", list.Select(t => GetInsertSql<T>(t, tableName)));
            ExecuteNonQuery(sql);
        }

        #region Private
        private static string GetInsertSql<T>(T model, string tableName)
        {
            StringBuilder sbSql = new StringBuilder();

            StringBuilder sbFields = new StringBuilder();
            StringBuilder sbValues = new StringBuilder();

            Type type = model.GetType();
            var properties = type.GetProperties();
            foreach (PropertyInfo p in properties)
            {
                string name = p.Name;
                if (!name.Equals("id", StringComparison.OrdinalIgnoreCase))
                {
                    sbFields.AppendFormat("[{0}],", name);
                    string sValue = null;
                    object oValue = p.GetValue(model);
                    if (oValue != null)
                        sValue = oValue.ToString().Replace("'", "");
                    sbValues.AppendFormat("'{0}',", sValue);
                }
            }
            sbSql.AppendFormat("INSERT INTO {0} ({1}) VALUES ({2});", tableName, sbFields.ToString().TrimEnd(','), sbValues.ToString().TrimEnd(','));
            return sbSql.ToString();
        }

        private static List<T> TransList<T>(SqlDataReader reader) where T : new()
        {
            List<T> tList = new List<T>();
            Type type = typeof(T);
            var properties = type.GetProperties();
            if (reader.Read())
            {
                do
                {
                    T t = new T();
                    foreach (PropertyInfo p in properties)
                    {
                        p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType));
                    }
                    tList.Add(t);
                }
                while (reader.Read());
            }
            return tList;
        }

        private static T TransModel<T>(SqlDataReader reader) where T : new()
        {
            T t = new T();
            if (reader.Read())
            {
                do
                {
                    Type type = typeof(T);
                    var properties = type.GetProperties();
                    foreach (PropertyInfo p in properties)
                    {
                        p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType));
                    }
                }
                while (reader.Read());
            }
            return t;
        }
        #endregion Private
    }
复制代码

4. 多线程抓取数据,入库

复制代码
    public class CrawlerCenter
    {
        private static Logger logger = new Logger(typeof(CrawlerCenter));

        /// <summary>
        /// 抓取
        /// </summary>
        public static void Handler()
        {
            Console.WriteLine("请输入Y/N进行类别表初始化确认! Y 删除Tencent_Category表然后重新创建,然后抓取类型数据,N(或者其他)跳过");
            string input = Console.ReadLine();
            if (input.Equals("Y", StringComparison.OrdinalIgnoreCase))
            {
                DBInit.InitCategoryTable();
                CrawlerCategory();
            }
            else
            {
                Console.WriteLine("你选择不初始化类别数据");
            }
            Console.WriteLine("*****************^_^**********************");


            Console.WriteLine("请输入Y/N进行课程数据初始化确认! Y 删除全部课程数据表然后重新创建,然后抓取课程数据,N(或者其他)跳过");
            input = Console.ReadLine();
            if (input.Equals("Y", StringComparison.OrdinalIgnoreCase))
            {
                DBInit.InitTencentSubjectTable();
                CrawlerCourse();
            }
            Console.WriteLine("*****************^_^**********************");
        }

        private static void CrawlerCategory()
        {
            Console.WriteLine($"{ DateTime.Now} 腾讯课堂类目开始抓取 - -");
            ISearch search = new CategorySearch();
            search.Crawler();
        }

        /// <summary> 
        /// 
        /// </summary>
        public static void CrawlerCourse()
        {
            Console.WriteLine($"{ DateTime.Now} 腾讯课堂课程开始抓取 - -");
            CategoryRepository categoryRepository = new CategoryRepository();

            //先从数据库拿到第三级类目信息
            List<TencentCategoryEntity> categoryList = categoryRepository.QueryListByLevel(3);

            var categoryquery = categoryList.Where(a => !a.Name.Contains("全部"));
 
            //TaskFactory taskFactory = Task.Factory; 
            //List<Task> taskList = new List<Task>();
            //foreach (TencentCategoryEntity category in categoryquery)
            //{
            //    ISearch searcher = new CourseSearch(category);
            //    taskList.Add(taskFactory.StartNew(searcher.Crawler));
            //    if (taskList.Count >= 10)  //当前最多开启10个线程
            //    {
            //        taskList = taskList.Where(t => !t.IsCompleted && !t.IsCanceled && !t.IsFaulted).ToList();
            //        Task.WaitAny(taskList.ToArray());//表示等待一个线程完成之后继续往后 
            //    }
            //}
            //Task.WaitAll(taskList.ToArray());

            List<Action> actions = new List<Action>();
            foreach (TencentCategoryEntity category in categoryquery)
            {
                ISearch searcher = new CourseSearch(category);
                actions.Add(new Action(searcher.Crawler));
            }
            //需要控制数量 
            ParallelOptions options = new ParallelOptions();
            options.MaxDegreeOfParallelism = 10;  //控制了最大的线程数量是10 个线程
            Parallel.Invoke(options, actions.ToArray()); //Parallel 表示有多少个委托  就开启多少个线程

            Console.WriteLine($"{ DateTime.Now} 腾讯课堂所有课程抓取完毕 - -"); 
        }

        /// <summary>
        /// 清理重复数据
        /// </summary>
        private static void CleanAll()
        {
            try
            {
                Console.WriteLine($"{ DateTime.Now} 开始清理重复数据 - -");
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i < 31; i++)
                {
                    sb.AppendFormat(@"DELETE FROM [dbo].[JD_Commodity_{0}] where productid IN(select productid from [dbo].[JD_Commodity_{0}] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_{0}] group by productid,CategoryId having count(0)>1);", i.ToString("000"));
                }
                #region
                /*
                 DELETE FROM [dbo].[JD_Commodity_001] where productid IN(select productid from [dbo].[JD_Commodity_001] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_001] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_002] where productid IN(select productid from [dbo].[JD_Commodity_002] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_002] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_003] where productid IN(select productid from [dbo].[JD_Commodity_003] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_003] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_004] where productid IN(select productid from [dbo].[JD_Commodity_004] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_004] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_005] where productid IN(select productid from [dbo].[JD_Commodity_005] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_005] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_006] where productid IN(select productid from [dbo].[JD_Commodity_006] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_006] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_007] where productid IN(select productid from [dbo].[JD_Commodity_007] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as IDv from [dbo].[JD_Commodity_007] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_008] where productid IN(select productid from [dbo].[JD_Commodity_008] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_008] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_009] where productid IN(select productid from [dbo].[JD_Commodity_009] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_009] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_010] where productid IN(select productid from [dbo].[JD_Commodity_010] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_010] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_011] where productid IN(select productid from [dbo].[JD_Commodity_011] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_011] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_012] where productid IN(select productid from [dbo].[JD_Commodity_012] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_012] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_013] where productid IN(select productid from [dbo].[JD_Commodity_013] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_013] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_014] where productid IN(select productid from [dbo].[JD_Commodity_014] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_014] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_015] where productid IN(select productid from [dbo].[JD_Commodity_015] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_015] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_016] where productid IN(select productid from [dbo].[JD_Commodity_016] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_016] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_017] where productid IN(select productid from [dbo].[JD_Commodity_017] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_017] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_018] where productid IN(select productid from [dbo].[JD_Commodity_018] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_018] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_019] where productid IN(select productid from [dbo].[JD_Commodity_019] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_019] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_020] where productid IN(select productid from [dbo].[JD_Commodity_020] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_020] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_021] where productid IN(select productid from [dbo].[JD_Commodity_021] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_021] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_022] where productid IN(select productid from [dbo].[JD_Commodity_022] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_022] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_023] where productid IN(select productid from [dbo].[JD_Commodity_023] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_023] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_024] where productid IN(select productid from [dbo].[JD_Commodity_024] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_024] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_025] where productid IN(select productid from [dbo].[JD_Commodity_025] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_025] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_026] where productid IN(select productid from [dbo].[JD_Commodity_026] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_026] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_027] where productid IN(select productid from [dbo].[JD_Commodity_027] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_027] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_028] where productid IN(select productid from [dbo].[JD_Commodity_028] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_028] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_029] where productid IN(select productid from [dbo].[JD_Commodity_029] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_029] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_030] where productid IN(select productid from [dbo].[JD_Commodity_030] group by productid,CategoryId having count(0)>1)
                                AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_030] group by productid,CategoryId having count(0)>1);
                 */
                #endregion
                Console.WriteLine("执行清理sql:{0}", sb.ToString());
                SqlHelper.ExecuteNonQuery(sb.ToString());
                Console.WriteLine("{0} 完成清理重复数据 - -", DateTime.Now);
            }
            catch (Exception ex)
            {
                logger.Error("CleanAll出现异常", ex);
            }
            finally
            {
                Console.WriteLine("{0} 结束清理重复数据 - -", DateTime.Now);
            }
        }
    }
复制代码
复制代码
        /// <summary>
        /// 谨慎使用  会全部删除数据库并重新创建!
        /// </summary>
        public static void InitCategoryTable()
        {
            #region Delete
            try
            {
                StringBuilder sb = new StringBuilder();
                sb.AppendFormat("DROP TABLE [dbo].[Tencent_Category];");
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                if (ex.Message.Equals("无法对 表 'dbo.Tencent_Category' 执行 删除,因为它不存在,或者您没有所需的权限。"))
                {
                    logger.Warn("初始化数据库InitCategoryTable删除的时候,原表不存在");
                }
                else
                {
                    logger.Error("初始化数据库InitTencent_CategoryTable失败", ex);
                    throw ex;
                }
            }
            #endregion Delete

            #region Create
            try
            {
                StringBuilder sb = new StringBuilder();
                sb.AppendFormat(@"CREATE TABLE [dbo].[Tencent_Category](
                                        [Id] [int] IDENTITY(1,1) NOT NULL,
                                        [Code] [varchar](100) NULL,
                                        [ParentCode] [varchar](100) NULL,
                                        [CategoryLevel] [int] NULL,
                                        [Name] [nvarchar](50) NULL,
                                        [Url] [varchar](1000) NULL,
                                        [State] [int] NULL,
                                      CONSTRAINT [PK_Tencent_Category] PRIMARY KEY CLUSTERED 
                                     (
                                         [Id] ASC
                                     )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
                                     ) ON [PRIMARY];");

                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                logger.Error("初始化数据库InitTencent_Category Table 创建失败", ex);
                throw ex;
            }
            #endregion Create 
        }
复制代码
复制代码
       /// <summary>
        /// 谨慎使用  会全部删除数据库并重新创建!
        /// </summary>
        public static void InitTencentSubjectTable()
        {
            #region Delete
            try
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i < 31; i++)
                {
                    sb.AppendFormat("DROP TABLE [dbo].[Tencent_Subject_{0}];", i.ToString("000"));
                }
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                if (ex.Message.Contains("因为它不存在,或者您没有所需的权限。"))
                {
                    logger.Warn("初始化数据库InitTencent_SubjectTable删除的时候,原表不存在");
                }
                else
                {
                    logger.Error("初始化数据库InitTencent_SubjectTable失败", ex);
                    throw ex;
                }
            }
            #endregion Delete

            #region Create
            try
            {
                // 直接定义了30张表
                StringBuilder sb = new StringBuilder();
                for (int i = 1; i < 31; i++)
                {
                    sb.AppendFormat(@"CREATE TABLE [dbo].[Tencent_Subject_{0}](
                                        [Id] [int] IDENTITY(1,1) NOT NULL,
                                        [CourseId] [bigint] NULL,
                                        [CategoryId] [int] NULL,
                                        [Title] [nvarchar](500) NULL,
                                        [Price] [decimal](18, 2) NULL,
                                        [Url] [varchar](1000) NULL,
                                        [ImageUrl] [varchar](1000) NULL,
                             CONSTRAINT [PK_Tencent_Subject_{0}] PRIMARY KEY CLUSTERED 
                            (
                                [Id] ASC
                            )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
                            ) ON [PRIMARY];", i.ToString("000"));
                }
                SqlHelper.ExecuteNonQuery(sb.ToString());
            }
            catch (Exception ex)
            {
                logger.Error("InitInitTencent_SubjectTable创建异常", ex);
                throw ex;
            }
            #endregion Create
        }
复制代码

 

posted @   明志德道  阅读(746)  评论(1编辑  收藏  举报
编辑推荐:
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
阅读排行:
· 在鹅厂做java开发是什么体验
· 百万级群聊的设计实践
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战
· 永远不要相信用户的输入:从 SQL 注入攻防看输入验证的重要性
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
点击右上角即可分享
微信分享提示