C# 网络爬虫
概念知识
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | /// 网络爬虫:是个应用程序,自动提取网页的程序, /// 通过数据筛选、过滤,得到有用的信息,一般是为搜索引擎服务或者作为内容来源 /// Application+WebRequest+Filter+Data+Threads /// 为什么要这个? /// 做个内容站--小说/电影/动漫---阿里云+爬虫+Web /// 数据搜集爬虫---招标数据爬虫/淘宝数据/招聘信息 /// 竞品分析--抓取竞争对手数据 /// 爬虫违法吗? /// 不问自取谓之偷;爬虫能拿到的信息都是浏览器能访问到的,就是公开数据; /// 不要基于盈利(小爬虫都没事儿);360搜索引擎--被判赔偿--违背了robots /// /// 爬虫攻防: /// robot协议(道德防线):根域名/robots.txt 弱的约定 /// ---服务端请求信息(urlrefer/agent){爬虫模拟好就能突破} /// ---用户登录{模拟请求带上cookie} /// ---IP黑名单白名单{代理请求} /// ---识别爬虫后定期返回验证码{换IP/打码平台} /// ---js动态加载/动态修改/数据图片化{可以解决} /// 爬虫:道高一尺魔高一丈,任何信息是无法阻止抓取的 /// /// Html下载----数据筛选清洗入库---多线程 /// /// 下载了一大堆的HTML,那么怎么去获取有用的信息呢? /// 怎么去进一步下载更多的东西呢? /// /// 数据筛选:正则(麻烦)/indexof+substring+replace/第三方工具包HtmlAgilityPack支持Xpath(本质是正则) /// 安全控件--ActiveX /// /// 1 深度抓取&批量数据高效匹配获取 /// 2 所见非所得属性获取 /// 3 Ajax数据的获取 /// 4 多线程抓取 /// /// 百度爬虫的规则:title keyword description +文字 /// /// 怎么样把京东全部的数据都抓到呢? /// https://www.jd.com/allSort.aspx 作为入口找出全部类别 /// 然后一个一个类别抓取 /// 1000多个类别,就是1000多个任务,每个任务的耗时还不一样, /// 如何用指定的线程数,去最高效的完成这些任务? /// /// 类别数据的抓取 /// + 刚才那个bug /// + 多线程任务分配 |
1.获取Html(通过模拟http请求)
http请求帮助类
/// <summary> /// http://tool.sufeinet.com/HttpHelper.aspx /// </summary> public class HttpHelper { private static Logger logger = new Logger(typeof(HttpHelper)); /// <summary> /// 根据url下载内容 之前是GB2312 /// </summary> /// <param name="url"></param> /// <returns></returns> public static string DownloadUrl(string url) { return DownloadHtml(url, Encoding.UTF8); } //HttpClient--WebApi /// <summary> /// 下载html /// http://tool.sufeinet.com/HttpHelper.aspx /// HttpWebRequest功能比较丰富,WebClient使用比较简单 /// WebRequest /// /// </summary> /// <param name="url"></param> /// <returns></returns> public static string DownloadHtml(string url, Encoding encode) { string html = string.Empty; try { //可以使用httpWebRequest 也可以使用HttpClent //https可以下载-- //ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback((object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors) => //{ // return true; //总是接受 //}); //ServicePointManager.SecurityProtocol = SecurityProtocolType.Ssl3 | SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls; HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;//模拟请求 request.Timeout = 30 * 1000;//设置30s的超时 request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"; //request.UserAgent = "User - Agent:Mozilla / 5.0(iPhone; CPU iPhone OS 7_1_2 like Mac OS X) App leWebKit/ 537.51.2(KHTML, like Gecko) Version / 7.0 Mobile / 11D257 Safari / 9537.53"; request.ContentType = "text/html; charset=utf-8";// "text/html;charset=gbk";// //request.Host = "search.yhd.com"; //request.Headers.Add("Cookie", @"newUserFlag=1; guid=YFT7C9E6TMFU93FKFVEN7TEA5HTCF5DQ26HZ; gray=959782; cid=av9kKvNkAPJ10JGqM_rB_vDhKxKM62PfyjkB4kdFgFY5y5VO; abtest=31; _ga=GA1.2.334889819.1425524072; grouponAreaId=37; provinceId=20; search_showFreeShipping=1; rURL=http%3A%2F%2Fsearch.yhd.com%2Fc0-0%2Fkiphone%2F20%2F%3Ftp%3D1.1.12.0.73.Ko3mjRR-11-FH7eo; aut=5GTM45VFJZ3RCTU21MHT4YCG1QTYXERWBBUFS4; ac=57265177%40qq.com; msessionid=H5ACCUBNPHMJY3HCK4DRF5VD5VA9MYQW; gc=84358431%2C102362736%2C20001585%2C73387122; tma=40580330.95741028.1425524063040.1430288358914.1430790348439.9; tmd=23.40580330.95741028.1425524063040.; search_browse_history=998435%2C1092925%2C32116683%2C1013204%2C6486125%2C38022757%2C36224528%2C24281304%2C22691497%2C26029325; detail_yhdareas=""; cart_cookie_uuid=b64b04b6-fca7-423b-b2d1-ff091d17e5e5; gla=20.237_0_0; JSESSIONID=14F1F4D714C4EE1DD9E11D11DDCD8EBA; wide_screen=1; linkPosition=search"); //request.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); //request.Headers.Add("Accept-Encoding", "gzip, deflate, sdch"); //request.Headers.Add("Referer", "http://list.yhd.com/c0-0/b/a-s1-v0-p1-price-d0-f0-m1-rt0-pid-mid0-kiphone/"); //Encoding enc = Encoding.GetEncoding("GB2312"); // 如果是乱码就改成 utf-8 / GB2312 //如何自动读取cookie request.CookieContainer = new CookieContainer();//1 给请求准备个container using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)//发起请求 { if (response.StatusCode != HttpStatusCode.OK) { logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode)); } else { try { //string sessionValue = response.Cookies["ASP.NET_SessionId"].Value;//2 读取cookie StreamReader sr = new StreamReader(response.GetResponseStream(), encode); html = sr.ReadToEnd();//读取数据 sr.Close(); } catch (Exception ex) { logger.Error(string.Format($"DownloadHtml抓取{url}失败"), ex); html = null; } } } } catch (System.Net.WebException ex) { if (ex.Message.Equals("远程服务器返回错误: (306)。")) { logger.Error("远程服务器返回错误: (306)。", ex); html = null; } } catch (Exception ex) { logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex); html = null; } return html; } public static string DownloadJsonData(string url, Encoding encode) { string html = string.Empty; try { HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;//模拟请求 request.Timeout = 30 * 1000;//设置30s的超时 request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"; request.ContentType = "application/json;charset=utf-8"; request.Referer = "https://ke.qq.com/admin/index.html"; request.Accept = "*/*"; request.Headers.Add("authority", "ke.qq.com"); request.Headers.Add("method", "GET"); request.Headers.Add("path", "/cgi-agency/agency/manager/get_course_list?aid=80207&count=10&page=1&pay_type=2&only_owner=1&cname_key=&course_state=0&signup_state=0&sort_method=0&is_ascending=0&bkn=449651946&t=0.0029"); request.Headers.Add("scheme", "https"); request.Headers.Add("accept-encodin", "gzip, deflate, br"); request.Headers.Add("accept-language", "zh-CN,zh;q=0.9"); request.Headers.Add("sec-fetch-mode", "cors"); request.Headers.Add("sec-fetch-site", "same-origin"); request.CookieContainer = new CookieContainer();//1 给请求准备个container request.Headers.Add("cookie", @"pgv_pvi=5087136768; RK=WXbobkaNsr; ptcz=447bb81ae5a54414d650b3476dced39f0cb80683ac471ec83f6cd827e4fdf5b7; pgv_pvid=3854687776; ts_uid=2094536335; localInterest=[2002]; iswebp=1; pac_uid=1_2751435708; ied_qq=o2751435708; isHideDealTips=1; ts_refer=pay.qq.com/enterprise/separate.shtml; tvfe_boss_uuid=86ed223020e11b4d; o_cookie=2751435708; ke_login_type=1; luin=o2751435708; course_origin=[{'cid':297038,'ext':{'pagelocation':'list, 1.3'}}]; tdw_data_testid=; tdw_data_flowid=; tdw_auin_data=-; tdw_first_visited=1; pgv_info=ssid=s6817397715; Hm_lvt_0c196c536f609d373a16d246a117fd44=1575856560,1575893514,1575896171,1575951173; index_new_key={'index_interest_cate_id':2002}; _qpsvr_localtk=0.05340837932628206; pgv_si=s9919754240; miniapp_qrcode_id=0115055f128549d785b0b1593b6084e4; uin=o2751435708; skey=@iGqJqPF6r; ptisp=cnc; lskey=000100005c4c2b2b6499fad1306a5536e639f4503825b33a2f4ee7f6c8ad4b5d43621199c8c99163c07c78c0; p_uin=o2751435708; pt4_token=SXdux*M4NNFzFRs6r59i-wKIVO4xk1AofSqAo8pvGg8_; p_skey=B4sUwtIxpxoGHY7OrQue0wyl2KkTlW4-Jr0MPc7Vf88_; p_luin=o2751435708; p_lskey=000400007af2a05b862ff9c18913d8cb474fb1c0c9f957de9f12b024a4372a5544cd95ecca989efba4edd06a; _pathcode=0.8704610796361238; tdw_data_sessionid=157595933804520618739236; ts_last=ke.qq.com/course/list; Hm_lpvt_0c196c536f609d373a16d246a117fd44=1575959482; tdw_data_new_2={'auin':' - ','sourcetype':'tuin','sourcefrom':'a3ff93bc','ver9':2751435708,'uin':2751435708,'visitor_id':'8882822910493176','url_page':'index','url_module':'searchbar','url_position':''}; tdw_data={'ver4':'4','ver5':'','ver6':'','refer':'','from_channel':'','path':'aBar - 0.8704610796361238','auin':' - ','uin':2751435708,'real_uin':2751435708}"); //如何自动读取cookie using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)//发起请求 { if (response.StatusCode != HttpStatusCode.OK) { logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode)); } else { try { //string sessionValue = response.Cookies["ASP.NET_SessionId"].Value;//2 读取cookie StreamReader sr = new StreamReader(response.GetResponseStream(), encode); html = sr.ReadToEnd();//读取数据 sr.Close(); } catch (Exception ex) { logger.Error(string.Format($"DownloadHtml抓取{url}失败"), ex); html = null; } } } } catch (System.Net.WebException ex) { if (ex.Message.Equals("远程服务器返回错误: (306)。")) { logger.Error("远程服务器返回错误: (306)。", ex); html = null; } } catch (Exception ex) { logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex); html = null; } return html; } }
调用
string url = $"{Constant.TencentClassUrl}/course/list/?tuin=7e4f8b7d"; string html = HttpHelper.DownloadUrl(url);
2. 筛选,过滤,获取有用的信息
用HtmlAgilityPack读取HTMl内容
HtmlDocument doc = new HtmlDocument(); doc.LoadHtml(html);
使用XPath筛选HMTL的元素节点,获取有用信息
string path = "//*[@id='auto-test-1']/div[1]/dl/dd"; HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path); foreach (HtmlNode node in nodeList) { HtmlDocument htmlDocument = new HtmlDocument(); htmlDocument.LoadHtml(node.InnerHtml); string codePath = "//a"; HtmlNode codeNode = htmlDocument.DocumentNode.SelectSingleNode(codePath); string href = codeNode.Attributes["href"].Value; if (!string.IsNullOrWhiteSpace(href)) { href = href.Replace(";", "&"); } string code = string.Empty; if (href != null && href.IndexOf("st=") != -1) { href = href.Replace(";", "&"); code = href.Substring(href.IndexOf("st=") + 3, 4); } TencentCategoryEntity category = new TencentCategoryEntity() { Id = _Count++, State = 1, CategoryLevel = 2, Code = code, ParentCode = parentCode }; category.Name = codeNode.InnerText; category.Url = href; categoryList.Add(category); if (codeNode.InnerText != "全部") { categoryList.AddRange(this.Third($"{Constant.TencentClassUrl}{href}&tuin=7e4f8b7d", code)); } } return categoryList;
3. 数据汇总,入库
获取某一页的数据
public void Show(string url) { string strHtml = HttpHelper.DownloadUrl(url); HtmlDocument document = new HtmlDocument(); document.LoadHtml(strHtml); string liPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li"; HtmlNodeCollection liNodes = document.DocumentNode.SelectNodes(liPath); foreach (var node in liNodes) { Console.WriteLine(); Console.WriteLine(); Console.WriteLine("************************************************"); HtmlDocument lidocument = new HtmlDocument(); lidocument.LoadHtml(node.OuterHtml); string aPath = "//*/a[1]"; HtmlNode classANode = lidocument.DocumentNode.SelectSingleNode(aPath); string aHref = classANode.Attributes["href"].Value; Console.WriteLine($"课程Url:{aHref}"); string Id = classANode.Attributes["data-id"].Value; Console.WriteLine($"课程Id:{Id}"); string imgPath = "//*/a[1]/img"; HtmlNode imgNode = lidocument.DocumentNode.SelectSingleNode(imgPath); string imgUrl = imgNode.Attributes["src"].Value; Console.WriteLine($"ImageUrl:{imgUrl}"); string namePaths = "//*/h4/a[1]"; HtmlNode nameNode = lidocument.DocumentNode.SelectSingleNode(namePaths); string name = nameNode.InnerText; Console.WriteLine(name); Console.WriteLine($"课程名称:{name}"); // courseEntity.Price = new Random().Next(100, 10000); //关于腾讯课堂上的课程价格抓取 这是一个进阶内容 通过普通方式搞不了(他有一个自己的算法) count = count + 1; } }
public void ShowPageData(string url) { string strHtml = HttpHelper.DownloadUrl(url); HtmlDocument document = new HtmlDocument(); document.LoadHtml(strHtml); string pagePath = "/html/body/section[1]/div/div[5]/a[@class='page-btn']"; HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath); int maxPage = pageNodes.Select(p => int.Parse(p.InnerText)).Max(); for (int page = 1; page <= maxPage; page++) { string pageUrl = $"{url}&page={page}"; Show(pageUrl); } Console.WriteLine($"一共抓取数据{count}条"); }
分页获取
#region 分页抓取 private void GetPageCourseData() { //1. 确定总页数 //2. 分别抓取每一页的数据 //3. 分析 过滤 清洗 //4. 入库 category.Url = $"{Constant.TencentClassUrl}{category.Url}"; string strHtml = HttpHelper.DownloadUrl(category.Url); HtmlDocument document = new HtmlDocument(); document.LoadHtml(strHtml); //Xpath string pagePath = "/html/body/section[1]/div/div[@class='sort-page']/a[@class='page-btn']"; HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath); int pageCount = 1; if (pageNodes != null) { pageCount = pageNodes.Select(a => int.Parse(a.InnerText)).Max(); } List<CourseEntity> courseList = new List<CourseEntity>(); for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++) { Console.WriteLine($"******************************当前是第{pageIndex}页数据************************************"); string pageIndexUrl = $"{category.Url}&page={pageIndex}"; List<CourseEntity> courseEntities = GetPageIndeData(pageIndexUrl); courseList.AddRange(courseEntities); } courseRepository.SaveList(courseList); } private List<CourseEntity> GetPageIndeData(string url) { //获取li标签里面的数据 // 先获取所有的Li // 然后循环获取li中的有效数据 string strHtml = HttpHelper.DownloadUrl(url); HtmlDocument document = new HtmlDocument(); document.LoadHtml(strHtml); string liPath = "/html/body/section[1]/div/div[@class='market-bd market-bd-6 course-list course-card-list-multi-wrap js-course-list']/ul/li"; HtmlNodeCollection liNodes = document.DocumentNode.SelectNodes(liPath); List<CourseEntity> courseEntities = new List<CourseEntity>(); foreach (var node in liNodes) { CourseEntity courseEntity = GetLiData(node); courseEntities.Add(courseEntity); } return courseEntities; } /// <summary> /// 当我们把这些数据获取到以后,那就应该保存起来 /// </summary> /// <param name="node"></param> private CourseEntity GetLiData(HtmlNode node) { CourseEntity courseEntity = new CourseEntity(); //从这里开始 HtmlDocument document = new HtmlDocument(); document.LoadHtml(node.OuterHtml); string aPath = "//*/a[1]"; HtmlNode classANode = document.DocumentNode.SelectSingleNode(aPath); string aHref = classANode.Attributes["href"].Value; courseEntity.Url = aHref; Console.WriteLine($"课程Url:{aHref}"); string Id = classANode.Attributes["data-id"].Value; Console.WriteLine($"课程Id:{Id}"); courseEntity.CourseId = long.Parse(Id); string imgPath = "//*/a[1]/img"; HtmlNode imgNode = document.DocumentNode.SelectSingleNode(imgPath); string imgUrl = imgNode.Attributes["src"].Value; courseEntity.ImageUrl = imgUrl; Console.WriteLine($"ImageUrl:{imgUrl}"); string namePaths = "//*/h4/a[1]"; HtmlNode nameNode = document.DocumentNode.SelectSingleNode(namePaths); string name = nameNode.InnerText; courseEntity.Title = name; Console.WriteLine($"课程名称:{name}"); courseEntity.Price = new Random().Next(100, 10000); //关于腾讯课堂上的课程价格抓取 这是一个进阶内容 通过普通方式搞不了(他有一个自己的算法) return courseEntity; } #endregion
入库操作
//分表之后, 以相对均匀的数据量存储到 不同的数据表中 //这是为了提高数据库的性能,其实是降低每一张表的,数据操作压力 //如果爬虫爬取的是亿万级数据,这种分表的方式是有必要的 public void SaveList(List<CourseEntity> courseList) { if (courseList == null || courseList.Count == 0) return; IEnumerable<IGrouping<string, CourseEntity>> group = courseList.GroupBy<CourseEntity, string>(c => GetTableName(c)); foreach (var data in group) { SqlHelper.InsertList<CourseEntity>(data.ToList(), data.Key); } } private string GetTableName(CourseEntity course) { return string.Format("Tencent_Subject_{0}", (course.CourseId % 30 + 1).ToString("000")); }
public class SqlHelper { private static Logger logger = new Logger(typeof(SqlHelper)); private static string _ConnStr = ConfigurationManager.ConnectionStrings["conn"].ConnectionString; /// <summary> /// 事务执行 /// </summary> /// <param name="sql"></param> public static void ExecuteNonQuery(string sql) { using (SqlConnection sqlConn = new SqlConnection(_ConnStr)) { sqlConn.Open(); SqlCommand cmd = new SqlCommand(sql, sqlConn); cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();// } } public static void ExecuteNonQueryWithTrans(string sql) { SqlTransaction trans = null; try { using (SqlConnection sqlConn = new SqlConnection(_ConnStr)) { sqlConn.Open(); trans = sqlConn.BeginTransaction(); SqlCommand cmd = new SqlCommand(sql, sqlConn, trans); cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();// trans.Commit(); } } catch (Exception ex) { //logger.Error(string.Format("ExecuteNonQueryWithTrans出现异常,sql={0}", sql), ex); if (trans != null && trans.Connection != null) trans.Rollback(); throw ex; } finally { } } public static List<T> QueryList<T>(string sql) where T : new() { using (SqlConnection sqlConn = new SqlConnection(_ConnStr)) { sqlConn.Open(); SqlCommand cmd = new SqlCommand(sql, sqlConn); return TransList<T>(cmd.ExecuteReader()); } } public static void Insert<T>(T model, string tableName) where T : new() { string sql = GetInsertSql<T>(model, tableName); ExecuteNonQuery(sql); } public static void InsertList<T>(List<T> list, string tableName) where T : new() { string sql = string.Join(" ", list.Select(t => GetInsertSql<T>(t, tableName))); ExecuteNonQuery(sql); } #region Private private static string GetInsertSql<T>(T model, string tableName) { StringBuilder sbSql = new StringBuilder(); StringBuilder sbFields = new StringBuilder(); StringBuilder sbValues = new StringBuilder(); Type type = model.GetType(); var properties = type.GetProperties(); foreach (PropertyInfo p in properties) { string name = p.Name; if (!name.Equals("id", StringComparison.OrdinalIgnoreCase)) { sbFields.AppendFormat("[{0}],", name); string sValue = null; object oValue = p.GetValue(model); if (oValue != null) sValue = oValue.ToString().Replace("'", ""); sbValues.AppendFormat("'{0}',", sValue); } } sbSql.AppendFormat("INSERT INTO {0} ({1}) VALUES ({2});", tableName, sbFields.ToString().TrimEnd(','), sbValues.ToString().TrimEnd(',')); return sbSql.ToString(); } private static List<T> TransList<T>(SqlDataReader reader) where T : new() { List<T> tList = new List<T>(); Type type = typeof(T); var properties = type.GetProperties(); if (reader.Read()) { do { T t = new T(); foreach (PropertyInfo p in properties) { p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType)); } tList.Add(t); } while (reader.Read()); } return tList; } private static T TransModel<T>(SqlDataReader reader) where T : new() { T t = new T(); if (reader.Read()) { do { Type type = typeof(T); var properties = type.GetProperties(); foreach (PropertyInfo p in properties) { p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType)); } } while (reader.Read()); } return t; } #endregion Private }
4. 多线程抓取数据,入库
public class CrawlerCenter { private static Logger logger = new Logger(typeof(CrawlerCenter)); /// <summary> /// 抓取 /// </summary> public static void Handler() { Console.WriteLine("请输入Y/N进行类别表初始化确认! Y 删除Tencent_Category表然后重新创建,然后抓取类型数据,N(或者其他)跳过"); string input = Console.ReadLine(); if (input.Equals("Y", StringComparison.OrdinalIgnoreCase)) { DBInit.InitCategoryTable(); CrawlerCategory(); } else { Console.WriteLine("你选择不初始化类别数据"); } Console.WriteLine("*****************^_^**********************"); Console.WriteLine("请输入Y/N进行课程数据初始化确认! Y 删除全部课程数据表然后重新创建,然后抓取课程数据,N(或者其他)跳过"); input = Console.ReadLine(); if (input.Equals("Y", StringComparison.OrdinalIgnoreCase)) { DBInit.InitTencentSubjectTable(); CrawlerCourse(); } Console.WriteLine("*****************^_^**********************"); } private static void CrawlerCategory() { Console.WriteLine($"{ DateTime.Now} 腾讯课堂类目开始抓取 - -"); ISearch search = new CategorySearch(); search.Crawler(); } /// <summary> /// /// </summary> public static void CrawlerCourse() { Console.WriteLine($"{ DateTime.Now} 腾讯课堂课程开始抓取 - -"); CategoryRepository categoryRepository = new CategoryRepository(); //先从数据库拿到第三级类目信息 List<TencentCategoryEntity> categoryList = categoryRepository.QueryListByLevel(3); var categoryquery = categoryList.Where(a => !a.Name.Contains("全部")); //TaskFactory taskFactory = Task.Factory; //List<Task> taskList = new List<Task>(); //foreach (TencentCategoryEntity category in categoryquery) //{ // ISearch searcher = new CourseSearch(category); // taskList.Add(taskFactory.StartNew(searcher.Crawler)); // if (taskList.Count >= 10) //当前最多开启10个线程 // { // taskList = taskList.Where(t => !t.IsCompleted && !t.IsCanceled && !t.IsFaulted).ToList(); // Task.WaitAny(taskList.ToArray());//表示等待一个线程完成之后继续往后 // } //} //Task.WaitAll(taskList.ToArray()); List<Action> actions = new List<Action>(); foreach (TencentCategoryEntity category in categoryquery) { ISearch searcher = new CourseSearch(category); actions.Add(new Action(searcher.Crawler)); } //需要控制数量 ParallelOptions options = new ParallelOptions(); options.MaxDegreeOfParallelism = 10; //控制了最大的线程数量是10 个线程 Parallel.Invoke(options, actions.ToArray()); //Parallel 表示有多少个委托 就开启多少个线程 Console.WriteLine($"{ DateTime.Now} 腾讯课堂所有课程抓取完毕 - -"); } /// <summary> /// 清理重复数据 /// </summary> private static void CleanAll() { try { Console.WriteLine($"{ DateTime.Now} 开始清理重复数据 - -"); StringBuilder sb = new StringBuilder(); for (int i = 1; i < 31; i++) { sb.AppendFormat(@"DELETE FROM [dbo].[JD_Commodity_{0}] where productid IN(select productid from [dbo].[JD_Commodity_{0}] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_{0}] group by productid,CategoryId having count(0)>1);", i.ToString("000")); } #region /* DELETE FROM [dbo].[JD_Commodity_001] where productid IN(select productid from [dbo].[JD_Commodity_001] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_001] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_002] where productid IN(select productid from [dbo].[JD_Commodity_002] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_002] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_003] where productid IN(select productid from [dbo].[JD_Commodity_003] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_003] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_004] where productid IN(select productid from [dbo].[JD_Commodity_004] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_004] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_005] where productid IN(select productid from [dbo].[JD_Commodity_005] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_005] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_006] where productid IN(select productid from [dbo].[JD_Commodity_006] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_006] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_007] where productid IN(select productid from [dbo].[JD_Commodity_007] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as IDv from [dbo].[JD_Commodity_007] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_008] where productid IN(select productid from [dbo].[JD_Commodity_008] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_008] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_009] where productid IN(select productid from [dbo].[JD_Commodity_009] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_009] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_010] where productid IN(select productid from [dbo].[JD_Commodity_010] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_010] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_011] where productid IN(select productid from [dbo].[JD_Commodity_011] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_011] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_012] where productid IN(select productid from [dbo].[JD_Commodity_012] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_012] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_013] where productid IN(select productid from [dbo].[JD_Commodity_013] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_013] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_014] where productid IN(select productid from [dbo].[JD_Commodity_014] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_014] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_015] where productid IN(select productid from [dbo].[JD_Commodity_015] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_015] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_016] where productid IN(select productid from [dbo].[JD_Commodity_016] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_016] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_017] where productid IN(select productid from [dbo].[JD_Commodity_017] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_017] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_018] where productid IN(select productid from [dbo].[JD_Commodity_018] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_018] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_019] where productid IN(select productid from [dbo].[JD_Commodity_019] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_019] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_020] where productid IN(select productid from [dbo].[JD_Commodity_020] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_020] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_021] where productid IN(select productid from [dbo].[JD_Commodity_021] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_021] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_022] where productid IN(select productid from [dbo].[JD_Commodity_022] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_022] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_023] where productid IN(select productid from [dbo].[JD_Commodity_023] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_023] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_024] where productid IN(select productid from [dbo].[JD_Commodity_024] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_024] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_025] where productid IN(select productid from [dbo].[JD_Commodity_025] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_025] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_026] where productid IN(select productid from [dbo].[JD_Commodity_026] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_026] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_027] where productid IN(select productid from [dbo].[JD_Commodity_027] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_027] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_028] where productid IN(select productid from [dbo].[JD_Commodity_028] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_028] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_029] where productid IN(select productid from [dbo].[JD_Commodity_029] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_029] group by productid,CategoryId having count(0)>1);DELETE FROM [dbo].[JD_Commodity_030] where productid IN(select productid from [dbo].[JD_Commodity_030] group by productid,CategoryId having count(0)>1) AND ID NOT IN(select max(ID) as ID from [dbo].[JD_Commodity_030] group by productid,CategoryId having count(0)>1); */ #endregion Console.WriteLine("执行清理sql:{0}", sb.ToString()); SqlHelper.ExecuteNonQuery(sb.ToString()); Console.WriteLine("{0} 完成清理重复数据 - -", DateTime.Now); } catch (Exception ex) { logger.Error("CleanAll出现异常", ex); } finally { Console.WriteLine("{0} 结束清理重复数据 - -", DateTime.Now); } } }
/// <summary> /// 谨慎使用 会全部删除数据库并重新创建! /// </summary> public static void InitCategoryTable() { #region Delete try { StringBuilder sb = new StringBuilder(); sb.AppendFormat("DROP TABLE [dbo].[Tencent_Category];"); SqlHelper.ExecuteNonQuery(sb.ToString()); } catch (Exception ex) { if (ex.Message.Equals("无法对 表 'dbo.Tencent_Category' 执行 删除,因为它不存在,或者您没有所需的权限。")) { logger.Warn("初始化数据库InitCategoryTable删除的时候,原表不存在"); } else { logger.Error("初始化数据库InitTencent_CategoryTable失败", ex); throw ex; } } #endregion Delete #region Create try { StringBuilder sb = new StringBuilder(); sb.AppendFormat(@"CREATE TABLE [dbo].[Tencent_Category]( [Id] [int] IDENTITY(1,1) NOT NULL, [Code] [varchar](100) NULL, [ParentCode] [varchar](100) NULL, [CategoryLevel] [int] NULL, [Name] [nvarchar](50) NULL, [Url] [varchar](1000) NULL, [State] [int] NULL, CONSTRAINT [PK_Tencent_Category] PRIMARY KEY CLUSTERED ( [Id] ASC )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] ) ON [PRIMARY];"); SqlHelper.ExecuteNonQuery(sb.ToString()); } catch (Exception ex) { logger.Error("初始化数据库InitTencent_Category Table 创建失败", ex); throw ex; } #endregion Create }
/// <summary> /// 谨慎使用 会全部删除数据库并重新创建! /// </summary> public static void InitTencentSubjectTable() { #region Delete try { StringBuilder sb = new StringBuilder(); for (int i = 1; i < 31; i++) { sb.AppendFormat("DROP TABLE [dbo].[Tencent_Subject_{0}];", i.ToString("000")); } SqlHelper.ExecuteNonQuery(sb.ToString()); } catch (Exception ex) { if (ex.Message.Contains("因为它不存在,或者您没有所需的权限。")) { logger.Warn("初始化数据库InitTencent_SubjectTable删除的时候,原表不存在"); } else { logger.Error("初始化数据库InitTencent_SubjectTable失败", ex); throw ex; } } #endregion Delete #region Create try { // 直接定义了30张表 StringBuilder sb = new StringBuilder(); for (int i = 1; i < 31; i++) { sb.AppendFormat(@"CREATE TABLE [dbo].[Tencent_Subject_{0}]( [Id] [int] IDENTITY(1,1) NOT NULL, [CourseId] [bigint] NULL, [CategoryId] [int] NULL, [Title] [nvarchar](500) NULL, [Price] [decimal](18, 2) NULL, [Url] [varchar](1000) NULL, [ImageUrl] [varchar](1000) NULL, CONSTRAINT [PK_Tencent_Subject_{0}] PRIMARY KEY CLUSTERED ( [Id] ASC )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY] ) ON [PRIMARY];", i.ToString("000")); } SqlHelper.ExecuteNonQuery(sb.ToString()); } catch (Exception ex) { logger.Error("InitInitTencent_SubjectTable创建异常", ex); throw ex; } #endregion Create }
付费内容,请联系本人QQ:1002453261
本文来自博客园,作者:明志德道,转载请注明原文链接:https://www.cnblogs.com/for-easy-fast/articles/12460627.html
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· 在鹅厂做java开发是什么体验
· 百万级群聊的设计实践
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战
· 永远不要相信用户的输入:从 SQL 注入攻防看输入验证的重要性
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析