C# 采集知网
采集知网
WebClient
/// <summary>
/// A <see cref="WebClient"/> with session and cookie support: every HTTP request it creates
/// is given the same <see cref="CookieContainer"/>.
/// </summary>
public class WebClientHelper : WebClient
{
    // Value returned by PostData and GetSrc when the request raises a WebException.
    private const string ErrorResult = "error";

    // Flags used to read non-public instance fields through reflection.
    private const System.Reflection.BindingFlags PrivateFieldFlags =
        System.Reflection.BindingFlags.NonPublic |
        System.Reflection.BindingFlags.GetField |
        System.Reflection.BindingFlags.Instance;

    // Cookie container attached to every HttpWebRequest created by this client.
    private CookieContainer cookieContainer;

    /// <summary>
    /// Creates a client with a new, empty cookie container.
    /// </summary>
    public WebClientHelper()
    {
        this.cookieContainer = new CookieContainer();
    }

    /// <summary>
    /// Creates a client that uses the supplied cookie container.
    /// </summary>
    /// <param name="cookies">Cookie container to attach to requests.</param>
    public WebClientHelper(CookieContainer cookies)
    {
        this.cookieContainer = cookies;
    }

    /// <summary>
    /// The cookie container attached to requests made by this client.
    /// </summary>
    public CookieContainer Cookies
    {
        get { return this.cookieContainer; }
        set { this.cookieContainer = value; }
    }

    /// <summary>
    /// Adds a custom cookie (for example login information) to the container.
    /// The name and value are trimmed before the cookie is created.
    /// </summary>
    /// <param name="name">Cookie name.</param>
    /// <param name="value">Cookie value.</param>
    /// <param name="path">Cookie path.</param>
    /// <param name="domain">Cookie domain.</param>
    public void SetCustCookie(string name, string value, string path, string domain)
    {
        Cookies.Add(new Cookie(name.Trim(), value.Trim(), path, domain));
    }

    /// <summary>
    /// Returns the request created by the base class, with the cookie container attached
    /// when the request is an <see cref="HttpWebRequest"/>.
    /// </summary>
    /// <param name="address">Address the request is created for.</param>
    /// <returns>The request to send.</returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);
        HttpWebRequest httpRequest = request as HttpWebRequest;
        if (httpRequest != null)
        {
            httpRequest.CookieContainer = cookieContainer;
        }
        return request;
    }

    #region PostData, GetSrc and GetFile wrappers

    /// <summary>
    /// POSTs form-encoded data to a URL and returns the response body as text.
    /// </summary>
    /// <param name="uriString">URL to post to.</param>
    /// <param name="postString">Data to post.</param>
    /// <param name="postStringEncoding">Charset name used to encode <paramref name="postString"/>
    /// (see https://blog.csdn.net/gengyiping18/article/details/77620061).</param>
    /// <param name="dataEncoding">Charset name used to decode the response.</param>
    /// <returns>
    /// The response text with all tab, carriage-return and line-feed characters removed,
    /// or the string "error" when a <see cref="WebException"/> is raised (the exception is
    /// passed to Logger.LogError).
    /// </returns>
    public string PostData(string uriString, string postString, string postStringEncoding = "utf-8", string dataEncoding = "utf-8")
    {
        try
        {
            byte[] postData = Encoding.GetEncoding(postStringEncoding).GetBytes(postString);

            // Indexer assignment replaces the value instead of appending to it, so the header
            // can never end up holding the content type twice.
            this.Headers["Content-Type"] = "application/x-www-form-urlencoded";

            byte[] responseData = this.UploadData(uriString, "POST", postData);
            return DecodeAndFlatten(responseData, dataEncoding);
        }
        catch (WebException we)
        {
            Logger.LogError("PostData:" + uriString + "?" + postString, we);
            return ErrorResult;
        }
    }

    /// <summary>
    /// Downloads a URL and returns its content as text.
    /// </summary>
    /// <param name="uriString">URL of the page.</param>
    /// <param name="dataEncoding">Charset name used to decode the response.</param>
    /// <returns>
    /// The page text with all tab, carriage-return and line-feed characters removed,
    /// or the string "error" when a <see cref="WebException"/> is raised (the exception is
    /// passed to Logger.LogError).
    /// </returns>
    public string GetSrc(string uriString, string dataEncoding = "utf-8")
    {
        try
        {
            byte[] responseData = this.DownloadData(uriString);
            return DecodeAndFlatten(responseData, dataEncoding);
        }
        catch (WebException we)
        {
            Logger.LogError("GetSrc:" + uriString, we);
            return ErrorResult;
        }
    }

    /// <summary>
    /// Downloads a file from a URL to a local path.
    /// </summary>
    /// <param name="urlString">URL of the file.</param>
    /// <param name="fileName">Full path of the local file to write.</param>
    /// <param name="msg">Empty on success; the <see cref="WebException"/> message on failure.</param>
    /// <returns><c>true</c> when the download completed, <c>false</c> when a <see cref="WebException"/> was raised.</returns>
    public bool GetFile(string urlString, string fileName, out string msg)
    {
        try
        {
            this.DownloadFile(urlString, fileName);
            msg = string.Empty;
            return true;
        }
        catch (WebException we)
        {
            msg = we.Message;
            return false;
        }
    }

    /// <summary>
    /// Lists every cookie stored in a cookie container.
    /// </summary>
    /// <remarks>
    /// Reads the container's non-public fields "m_domainTable" and "m_list" through reflection,
    /// so it only works on runtimes whose CookieContainer has fields with those names.
    /// NOTE(review): .NET 6 and later expose a public CookieContainer.GetAllCookies(); prefer it
    /// if the target framework allows.
    /// </remarks>
    /// <param name="cc">Container to enumerate; <c>null</c> yields an empty list.</param>
    /// <returns>All cookies found in the container.</returns>
    public List<Cookie> GetAllCookies(CookieContainer cc)
    {
        List<Cookie> lstCookies = new List<Cookie>();
        if (cc == null)
        {
            // A missing container holds no cookies; avoid a NullReferenceException below.
            return lstCookies;
        }

        Hashtable domainTable = (Hashtable)cc.GetType().InvokeMember(
            "m_domainTable", PrivateFieldFlags, null, cc, new object[] { });

        foreach (object pathList in domainTable.Values)
        {
            SortedList cookiesByPath = (SortedList)pathList.GetType().InvokeMember(
                "m_list", PrivateFieldFlags, null, pathList, new object[] { });

            foreach (CookieCollection colCookies in cookiesByPath.Values)
            {
                foreach (Cookie c in colCookies)
                {
                    lstCookies.Add(c);
                }
            }
        }
        return lstCookies;
    }

    /// <summary>
    /// Lists every cookie stored in this client's own cookie container.
    /// </summary>
    /// <returns>All cookies found; an empty list when the container is <c>null</c>.</returns>
    public List<Cookie> GetAllCookies()
    {
        return GetAllCookies(this.cookieContainer);
    }

    #endregion PostData, GetSrc and GetFile wrappers

    // Decodes response bytes with the named charset and removes every tab, CR and LF character.
    private static string DecodeAndFlatten(byte[] responseData, string dataEncoding)
    {
        string text = Encoding.GetEncoding(dataEncoding).GetString(responseData);
        return text.Replace("\t", "").Replace("\r", "").Replace("\n", "");
    }
}
MAIN 采集知网("知网","",0,0);
/// <summary>
/// Searches CNKI (kns.cnki.net) for records of one institution published in the last 30 days,
/// parses the result pages into LunWen records and stores them with LunWenDal.AddBatch.
/// </summary>
/// <param name="type">Label written to each record's wt_Type; blank falls back to "知网".</param>
/// <param name="StrUnit">Institution sent as the "danwei_1_value1" search field; blank falls back to BaseDAL.GetSystemInstitutions().</param>
/// <param name="minPage">First result page to fetch; zero or negative becomes 1.</param>
/// <param name="maxPage">Last result page to fetch; zero or negative becomes 10.</param>
public void 采集知网(string type, string StrUnit, int minPage, int maxPage)
{
    if (string.IsNullOrWhiteSpace(type)) type = "知网";
    if (string.IsNullOrWhiteSpace(StrUnit)) StrUnit = BaseDAL.GetSystemInstitutions();
    if (minPage <= 0) minPage = 1;
    if (maxPage <= 0) maxPage = 10;

    // Page size requested from the server; also used to derive the number of pages.
    const int recordsPerPage = 50;

    // Patterns applied to the content of each <li> element of the result pages.
    const string titlePattern = @"class=""title_c""><a href='(.*)' target='_blank'>(.*)</a>(.*)</h3>";
    const string authorPattern = @"class=""author""> (.*),</span>";
    const string journalPattern = @"class=""journal"">[\s\S]*?<\/span>";
    const string databasePattern = @"class=""database"">(.*?)</span>";
    const string pubDatesPattern = @"<label>.*?发表时间:(.*)<\/label>";
    const string abstractPattern = @"<p.*?class=""abstract_c"">[\s\S]*?<\/p>";

    DateTime now = DateTime.Now;
    Dictionary<string, string> postData = new Dictionary<string, string>
    {
        { "pagename", "brief_result_aspx" },
        { "dbPrefix", "SCDB" },
        { "dbCatalog", "中国学术文献网络出版总库" },
        { "ConfigFile", "SCDB.xml" },
        { "db_opt", "CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD" },
        { "danwei_1_sel", "AF" },
        { "danwei_1_value1", StrUnit ?? string.Empty },
        { "publishdate_from", now.AddDays(-30).ToString("yyyy-MM-dd") },
        { "publishdate_to", now.ToString("yyyy-MM-dd") },
        { "danwei_1_special1", "=" },
        { "isinEn", "1" }
    };
    string postDataStr = postData.ToUrlParams();

    List<string> pages = new List<string>();

    // The client is disposable; the cookies it keeps tie the three request stages together.
    using (WebClientHelper webClient = new WebClientHelper())
    {
        // Stage 1: establish the search session.
        string url = "https://kns.cnki.net/kns/request/SearchHandler.ashx";
        string mainData = webClient.PostData(url, postDataStr);
        if (mainData == "error")
        {
            // PostData returns "error" when the request failed; the later requests
            // depend on this session, so stop here.
            Console.WriteLine("建立会话失败,采集终止");
            return;
        }

        // Stage 2: read the total hit count and the query id from the first result page.
        url = "https://kns.cnki.net/kns/brief/brief.aspx?pagename=ASP.brief_result_aspx&isinEn=1&dbPrefix=SCDB&dbCatalog=%e4%b8%ad%e5%9b%bd%e5%ad%a6%e6%9c%af%e6%96%87%e7%8c%ae%e7%bd%91%e7%bb%9c%e5%87%ba%e7%89%88%e6%80%bb%e5%ba%93&ConfigFile=SCDB.xml&research=off&t=1592202301266";
        string res = webClient.PostData(url, postDataStr);

        Match countMatch = Regex.Match(res, "id=\"resultcount\" name=\"resultcount\" value=\"([0-9]*)\"");
        var num = countMatch.Groups[1].Value.ToDouble();
        Console.WriteLine("num:" + num);

        Match queryMatch = Regex.Match(res, "QueryID=[0-9]*", RegexOptions.Singleline);
        string QueryID = queryMatch.Value.Replace("QueryID=", "");
        Console.WriteLine("QueryID:" + QueryID);

        if (num <= 0)
        {
            Console.WriteLine("没有采集到新数据");
            return;
        }

        // Round up: a partial last page still has to be fetched (rounding to nearest
        // dropped it, and produced zero pages for fewer than half a page of hits).
        int totalPages = (int)Math.Ceiling(num / recordsPerPage);
        int lastPage = Math.Min(totalPages, maxPage);

        // Stage 3: fetch the requested pages.
        for (int page = minPage; page <= lastPage; page++)
        {
            url = "https://kns.cnki.net/kns/brief/brief.aspx?curpage=" + page + "&RecordsPerPage=" + recordsPerPage + "&QueryID=" + QueryID + "&ID=&turnpage=1&tpagemode=L&dbPrefix=SCDB&Fields=&DisplayMode=custommode&PageName=ASP.brief_result_aspx&sKuaKuID=0&isinEn=1&";
            res = webClient.PostData(url, postDataStr);
            if (res != "error")
            {
                pages.Add(res);
            }
        }
    }

    string data = string.Concat(pages);

    // Each <li> element is one search hit. PostData strips line breaks, so "." can span a hit.
    List<LunWen> modes = new List<LunWen>();
    foreach (Match mat in Regex.Matches(data, @"<li>(.*?)<\/li>"))
    {
        string row = mat.Groups[1].Value;

        Match title = Regex.Match(row, titlePattern, RegexOptions.Singleline);
        if (string.IsNullOrWhiteSpace(title.Value))
        {
            // Not a result entry.
            continue;
        }

        LunWen model = new LunWen();
        model.wt_Type = type;
        model.wt_DownDate = DateTime.Now;
        model.wt_Title = title.Groups[2].Value;
        model.wt_Url = "https://kns.cnki.net" + title.Groups[1].Value;

        // Authors: every run of Chinese characters inside the author span.
        Match authorAndLink = Regex.Match(row, authorPattern);
        if (!string.IsNullOrWhiteSpace(authorAndLink.Value))
        {
            MatchCollection names = Regex.Matches(authorAndLink.Value, @"[\u4E00-\u9FFF]+", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);
            string authors_names = "";
            for (int j = 0; j < names.Count; j++)
            {
                // NOTE(review): each name is prefixed with a comma, so the stored value starts
                // with ",". Kept as is because existing data may rely on it - confirm.
                authors_names += "," + names[j].Value;
            }
            model.wt_Author = authors_names;
        }

        // Unit: the Chinese text of the journal span.
        Match journal = Regex.Match(row, journalPattern);
        if (!string.IsNullOrWhiteSpace(journal.Value))
        {
            model.wt_Unit = GetChineseWord(journal.Value);
        }

        // Database span (e.g. journal article): resource type and source.
        Match database = Regex.Match(row, databasePattern);
        if (!string.IsNullOrWhiteSpace(database.Value))
        {
            model.wt_ResouceType = GetChineseWord(database.Value);
            model.wt_Source = database.Groups[1].Value;
        }

        // Publication time text, e.g. an issue such as "2022年01期".
        Match pub_dates = Regex.Match(row, pubDatesPattern);
        if (!string.IsNullOrWhiteSpace(pub_dates.Value))
        {
            model.wt_Volume = pub_dates.Groups[1].Value;
        }

        // Content: the abstract paragraph, markup included.
        Match abstract_c = Regex.Match(row, abstractPattern);
        if (!string.IsNullOrWhiteSpace(abstract_c.Value))
        {
            model.wt_Content = abstract_c.Value;
        }

        modes.Add(model);
        Console.WriteLine(type + ":" + model.wt_Title);
    }

    if (modes.Count > 0)
    {
        Console.WriteLine("采集到新数据准备入库");
        LunWenDal.AddBatch(modes);
        Console.WriteLine("采集到新数据已入库完成");
    }
    else
    {
        Console.WriteLine("没有采集到新数据");
    }
}
获取中文:
/// <summary>
/// Keeps only the characters in the range U+4E00 to U+9FFF: every run of such characters
/// found in the input is appended, in order, to the result.
/// </summary>
/// <param name="oriText">Text to scan.</param>
/// <returns>The matched runs concatenated without separators; an empty string when nothing matches.</returns>
public static string GetChineseWord(string oriText)
{
    const string chineseRun = @"[\u4E00-\u9FFF]+";

    MatchCollection runs = Regex.Matches(oriText, chineseRun, RegexOptions.IgnoreCase);
    StringBuilder collected = new StringBuilder();
    for (int i = 0; i < runs.Count; i++)
    {
        collected.Append(runs[i].Value);
    }

    return collected.ToString();
}
实体类:
/// <summary>
/// Paper record entity (original label: 论文网推 LunWen).
/// </summary>
public partial class LunWen
{
    /// <summary>
    /// Creates an empty record.
    /// </summary>
    public LunWen()
    {
    }

    /// <summary>Numeric id of the record (wt_id).</summary>
    public int wt_id { get; set; }

    /// <summary>Unit (institution) name.</summary>
    public string wt_Unit { get; set; }

    /// <summary>Source platform label; the original note lists 知网 and 万文.</summary>
    public string wt_Type { get; set; }

    /// <summary>Paper title.</summary>
    public string wt_Title { get; set; }

    /// <summary>Paper URL.</summary>
    public string wt_Url { get; set; }

    /// <summary>Resource type.</summary>
    public string wt_ResouceType { get; set; }

    /// <summary>Author(s).</summary>
    public string wt_Author { get; set; }

    /// <summary>Source.</summary>
    public string wt_Source { get; set; }

    /// <summary>Journal.</summary>
    public string wt_Volume { get; set; }

    /// <summary>Content.</summary>
    public string wt_Content { get; set; }

    /// <summary>Download date; null until set.</summary>
    public DateTime? wt_DownDate { get; set; }
}
Python采集知网 https://www.cnblogs.com/luomingui/p/17701445.html