Go to my github

C# 采集知网

采集知网

WebClient

 /// <summary>
    /// A <see cref="WebClient"/> that attaches a shared <see cref="CookieContainer"/> to every
    /// HTTP request it creates, so cookies (and therefore the server session) survive across calls.
    /// </summary>
    public class WebClientHelper:WebClient {

        // Value returned by PostData/GetSrc when the request fails with a WebException.
        private const string ErrorResult = "error";

        // Cookie container attached to every HttpWebRequest; kept non-null by the constructors and the setter.
        private CookieContainer cookieContainer;

        /// <summary>
        /// Creates an instance with a new, empty cookie container.
        /// </summary>
        public WebClientHelper() {
            this.cookieContainer = new CookieContainer();
        }

        /// <summary>
        /// Creates an instance that uses an existing cookie container.
        /// </summary>
        /// <param name="cookies">Cookie container to use; a new empty one is created when null.</param>
        public WebClientHelper(CookieContainer cookies) {
            this.cookieContainer = cookies ?? new CookieContainer();
        }

        /// <summary>
        /// The cookie container attached to outgoing requests.
        /// Assigning null installs a new empty container instead of leaving the field null.
        /// </summary>
        public CookieContainer Cookies {
            get { return this.cookieContainer; }
            set { this.cookieContainer = value ?? new CookieContainer(); }
        }

        /// <summary>
        /// Adds a caller-supplied cookie (for example login information) to the container.
        /// </summary>
        /// <param name="name">Cookie name; surrounding whitespace is trimmed.</param>
        /// <param name="value">Cookie value; surrounding whitespace is trimmed, null is treated as empty.</param>
        /// <param name="path">Cookie path.</param>
        /// <param name="domain">Cookie domain.</param>
        /// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
        public void SetCustCookie(string name,string value,string path,string domain) {
            if(name == null) throw new ArgumentNullException("name");
            Cookie cookie = new Cookie(name.Trim(),(value ?? string.Empty).Trim(),path,domain);
            Cookies.Add(cookie);
        }

        /// <summary>
        /// Creates the request via the base class and, when it is an <see cref="HttpWebRequest"/>,
        /// attaches the cookie container to it.
        /// </summary>
        /// <param name="address">Target address of the request.</param>
        /// <returns>The request produced by the base class.</returns>
        protected override WebRequest GetWebRequest(Uri address) {
            WebRequest request = base.GetWebRequest(address);
            HttpWebRequest httpRequest = request as HttpWebRequest;
            if(httpRequest != null) {
                httpRequest.CookieContainer = cookieContainer;
            }
            return request;
        }

        #region PostData, GetSrc, GetFile and cookie enumeration helpers

        /// <summary>
        /// POSTs form data to a URL and returns the response body as text.
        /// </summary>
        /// <param name="uriString">URL to POST to.</param>
        /// <param name="postString">Form data to send; null is sent as an empty body.</param>
        /// <param name="postStringEncoding">Charset used to encode the POST data (see https://blog.csdn.net/gengyiping18/article/details/77620061).</param>
        /// <param name="dataEncoding">Charset used to decode the response.</param>
        /// <returns>
        /// The response text with tab, CR and LF characters removed,
        /// or the literal string "error" when a <see cref="WebException"/> occurs (the exception is logged).
        /// </returns>
        public string PostData(string uriString,string postString,string postStringEncoding = "utf-8",string dataEncoding = "utf-8") {
            try {
                byte[] postData = Encoding.GetEncoding(postStringEncoding).GetBytes(postString ?? string.Empty);
                // Assign instead of Add: Add appends to a Content-Type that is already present,
                // producing a comma-separated value rather than replacing it.
                this.Headers["Content-Type"] = "application/x-www-form-urlencoded";
                byte[] responseData = this.UploadData(uriString,"POST",postData);
                return DecodeAndFlatten(responseData,dataEncoding);
            } catch(WebException we) {
                Logger.LogError("PostData:" + uriString + "?" + postString,we);
                return ErrorResult;
            }
        }

        /// <summary>
        /// Downloads a URL and returns its body as text.
        /// </summary>
        /// <param name="uriString">Page URL.</param>
        /// <param name="dataEncoding">Charset used to decode the response.</param>
        /// <returns>
        /// The page text with tab, CR and LF characters removed,
        /// or the literal string "error" when a <see cref="WebException"/> occurs (the exception is logged).
        /// </returns>
        public string GetSrc(string uriString,string dataEncoding = "utf-8") {
            try {
                byte[] responseData = this.DownloadData(uriString);
                return DecodeAndFlatten(responseData,dataEncoding);
            } catch(WebException we) {
                Logger.LogError("GetSrc:" + uriString,we);
                return ErrorResult;
            }
        }

        /// <summary>
        /// Downloads a file from a URL to a local path.
        /// </summary>
        /// <param name="urlString">File URL.</param>
        /// <param name="fileName">Full path of the local file to write.</param>
        /// <param name="msg">Empty on success; the <see cref="WebException"/> message on failure.</param>
        /// <returns>True when the download succeeded, false when a <see cref="WebException"/> occurred.</returns>
        public bool GetFile(string urlString,string fileName,out string msg) {
            try {
                this.DownloadFile(urlString,fileName);
                msg = string.Empty;
                return true;
            } catch(WebException we) {
                msg = we.Message;
                return false;
            }
        }

        /// <summary>
        /// Lists every cookie stored in a container, across all domains and paths.
        /// </summary>
        /// <remarks>
        /// Reads the non-public fields "m_domainTable" and "m_list" through reflection, so it depends on
        /// the runtime's internal layout of <see cref="CookieContainer"/>.
        /// NOTE(review): newer runtimes (.NET 6+) expose a public CookieContainer.GetAllCookies() - prefer it where available.
        /// </remarks>
        /// <param name="cc">Container to enumerate; null yields an empty list.</param>
        /// <returns>A new list holding the cookies found.</returns>
        public  List<Cookie> GetAllCookies(CookieContainer cc) {
            List<Cookie> lstCookies = new List<Cookie>();
            if(cc == null) return lstCookies;

            const System.Reflection.BindingFlags PrivateInstanceField =
                System.Reflection.BindingFlags.NonPublic|System.Reflection.BindingFlags.GetField|
                System.Reflection.BindingFlags.Instance;

            // Domain table: one entry per domain, each value holding that domain's per-path list.
            Hashtable table = (Hashtable)cc.GetType().InvokeMember("m_domainTable",
                PrivateInstanceField,null,cc,new object[] { });

            foreach(object pathList in table.Values) {
                SortedList lstCookieCol = (SortedList)pathList.GetType().InvokeMember("m_list",
                    PrivateInstanceField,null,pathList,new object[] { });
                foreach(CookieCollection colCookies in lstCookieCol.Values) {
                    foreach(Cookie c in colCookies) lstCookies.Add(c);
                }
            }
            return lstCookies;
        }

        /// <summary>
        /// Lists every cookie stored in this client's own container.
        /// </summary>
        /// <returns>A new list holding the cookies found.</returns>
        public List<Cookie> GetAllCookies() {
            return GetAllCookies(this.cookieContainer);
        }

        // Decodes a response body with the given charset and strips tab, CR and LF characters.
        private static string DecodeAndFlatten(byte[] responseData,string dataEncoding) {
            string srcString = Encoding.GetEncoding(dataEncoding).GetString(responseData);
            return srcString.Replace("\t","").Replace("\r","").Replace("\n","");
        }
        #endregion
    }

Main 中的调用示例:采集知网("知网","",0,0);

  /// <summary>
  /// Searches CNKI (kns.cnki.net) for papers published in the last 30 days by an institution,
  /// downloads the requested result pages, parses each list item into a LunWen and stores them in bulk.
  /// </summary>
  /// <param name="type">Value written to LunWen.wt_Type; defaults to "知网" when blank.</param>
  /// <param name="StrUnit">Institution to search for; defaults to BaseDAL.GetSystemInstitutions() when blank.</param>
  /// <param name="minPage">First result page to fetch (1-based); values &lt;= 0 become 1.</param>
  /// <param name="maxPage">Last result page to fetch; values &lt;= 0 become 10.</param>
  public void 采集知网(string type,string StrUnit,int minPage,int maxPage) {
      if(string.IsNullOrWhiteSpace(type)) type="知网";
      if(string.IsNullOrWhiteSpace(StrUnit)) StrUnit=BaseDAL.GetSystemInstitutions();
      if(minPage<=0) minPage=1;
      if(maxPage<=0) maxPage=10;

      // Records requested per page; must match RecordsPerPage in the paging URL below.
      const int pageSize = 50;
      // Value WebClientHelper.PostData returns when the request failed.
      const string failed = "error";

      DateTime now = DateTime.Now;
      Dictionary<string,string> postData = new Dictionary<string,string>();
      postData.Add("pagename","brief_result_aspx");
      postData.Add("dbPrefix","SCDB");
      postData.Add("dbCatalog","中国学术文献网络出版总库");
      postData.Add("ConfigFile","SCDB.xml");
      postData.Add("db_opt","CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD");
      postData.Add("danwei_1_sel","AF");
      postData.Add("danwei_1_value1",StrUnit ?? string.Empty);
      postData.Add("publishdate_from",now.AddDays(-30).ToString("yyyy-MM-dd"));
      postData.Add("publishdate_to",now.ToString("yyyy-MM-dd"));
      postData.Add("danwei_1_special1","=");
      postData.Add("isinEn","1");
      string postDataStr = postData.ToUrlParams();

      StringBuilder data = new StringBuilder();
      // One client for the whole run: its cookie container carries the search session between requests.
      using(WebClientHelper webClient = new WebClientHelper()) {
          // Register the search; the client keeps whatever cookies the server sets.
          string url = "https://kns.cnki.net/kns/request/SearchHandler.ashx";
          if(webClient.PostData(url,postDataStr)==failed) {
              Console.WriteLine("建立会话失败");
              return;
          }

          // First result page: carries the total record count and the QueryID used for paging.
          url="https://kns.cnki.net/kns/brief/brief.aspx?pagename=ASP.brief_result_aspx&isinEn=1&dbPrefix=SCDB&dbCatalog=%e4%b8%ad%e5%9b%bd%e5%ad%a6%e6%9c%af%e6%96%87%e7%8c%ae%e7%bd%91%e7%bb%9c%e5%87%ba%e7%89%88%e6%80%bb%e5%ba%93&ConfigFile=SCDB.xml&research=off&t=1592202301266";
          string res = webClient.PostData(url,postDataStr);
          if(res==failed) {
              Console.WriteLine("查询总条数失败");
              return;
          }

          Match countMatch = Regex.Match(res,"id=\"resultcount\" name=\"resultcount\" value=\"([0-9]*)\"");
          int num;
          // A missing or empty count is treated as zero results.
          if(!int.TryParse(countMatch.Groups[1].Value,out num)) num=0;
          Console.WriteLine("num:"+num);

          string QueryID = Regex.Match(res,"QueryID=([0-9]*)").Groups[1].Value;
          Console.WriteLine("QueryID:"+QueryID);
          if(num<=0) {
              Console.WriteLine("没有采集到新数据");
              return;
          }

          // Ceiling division: a partially filled last page still has to be fetched
          // (rounding to nearest dropped it, and produced 0 pages for fewer than 25 results).
          int pageCount = (num+pageSize-1)/pageSize;
          int lastPage = Math.Min(pageCount,maxPage);

          for(int i = minPage;i<=lastPage;i++) {
              url="https://kns.cnki.net/kns/brief/brief.aspx?curpage="+i+"&RecordsPerPage=50&QueryID="+QueryID+"&ID=&turnpage=1&tpagemode=L&dbPrefix=SCDB&Fields=&DisplayMode=custommode&PageName=ASP.brief_result_aspx&sKuaKuID=0&isinEn=1&";
              res=webClient.PostData(url,postDataStr);
              if(res==failed) {
                  // Skip a failed page instead of feeding the sentinel text to the parser.
                  Console.WriteLine("第"+i+"页采集失败");
                  continue;
              }
              data.Append(res);
          }
      }

      // PostData already removed line breaks, so each <li> item sits on a single line.
      MatchCollection ret = Regex.Matches(data.ToString(),@"<li>(.*?)<\/li>");

      Dictionary<string,string> pattern = new Dictionary<string,string>();
      pattern.Add("title",@"class=""title_c""><a href='(.*)' target='_blank'>(.*)</a>(.*)</h3>");
      pattern.Add("author",@"class=""author""> (.*),</span>");
      pattern.Add("journal",@"class=""journal"">[\s\S]*?<\/span>");
      pattern.Add("database",@"class=""database"">(.*?)</span>");
      pattern.Add("pub_dates",@"<label>.*?发表时间:(.*)<\/label>");
      pattern.Add("abstract_c",@"<p.*?class=""abstract_c"">[\s\S]*?<\/p>");

      List<LunWen> modes = new List<LunWen>();
      foreach(Match mat in ret) {
          string row = mat.Groups[1].Value;

          // Items without a title link are not result rows; ignore them.
          Match title = Regex.Match(row,pattern["title"],RegexOptions.Singleline);
          if(!title.Success) continue;

          LunWen model = new LunWen();
          model.wt_Type=type;
          model.wt_DownDate=DateTime.Now;
          model.wt_Title=title.Groups[2].Value;
          model.wt_Url="https://kns.cnki.net"+title.Groups[1].Value;

          // Authors: every run of Chinese characters inside the author span.
          Match authorAndLink = Regex.Match(row,pattern["author"]);
          if(authorAndLink.Success) {
              StringBuilder names = new StringBuilder();
              // NOTE(review): each name is prefixed with ',' so the stored value starts with a comma;
              // kept as-is because existing data may rely on that format - confirm before changing.
              foreach(Match name in Regex.Matches(authorAndLink.Value,@"[\u4E00-\u9FFF]+")) {
                  names.Append(',').Append(name.Value);
              }
              model.wt_Author=names.ToString();
          }

          // Journal span: only its Chinese characters are kept, stored in wt_Unit.
          Match journal = Regex.Match(row,pattern["journal"]);
          if(journal.Success) {
              model.wt_Unit=GetChineseWord(journal.Value);
          }

          // Database span: Chinese characters go to wt_ResouceType, the raw inner text to wt_Source.
          Match database = Regex.Match(row,pattern["database"]);
          if(database.Success) {
              model.wt_ResouceType=GetChineseWord(database.Value);
              model.wt_Source=database.Groups[1].Value;
          }

          // Text following "发表时间:" in the label, e.g. "2022年01期".
          Match pub_dates = Regex.Match(row,pattern["pub_dates"]);
          if(pub_dates.Success) {
              model.wt_Volume=pub_dates.Groups[1].Value;
          }

          // Abstract: the whole <p class="abstract_c"> element, markup included.
          Match abstract_c = Regex.Match(row,pattern["abstract_c"]);
          if(abstract_c.Success) {
              model.wt_Content=abstract_c.Value;
          }

          modes.Add(model);
          Console.WriteLine(type+":"+model.wt_Title);
      }

      if(modes.Count>0) {
          Console.WriteLine("采集到新数据准备入库");
          LunWenDal.AddBatch(modes);
          Console.WriteLine("采集到新数据已入库完成");
      } else {
          Console.WriteLine("没有采集到新数据");
      }
  }

获取中文:

       /// <summary>
       /// Extracts the Chinese characters (U+4E00..U+9FFF) from a string, concatenated in their original order.
       /// </summary>
       /// <param name="oriText">Text to scan; may be null or empty.</param>
       /// <returns>The Chinese characters found, or an empty string when there are none or the input is null.</returns>
       public static string GetChineseWord(string oriText) {
            // Regex.Matches rejects a null input, so answer the trivial cases up front.
            if(string.IsNullOrEmpty(oriText)) return string.Empty;

            StringBuilder sb = new StringBuilder(oriText.Length);
            foreach(Match run in Regex.Matches(oriText,@"[\u4E00-\u9FFF]+")) {
                sb.Append(run.Value);
            }
            return sb.ToString();
        }

实体类:

/// <summary>
/// One scraped paper record (entity "LunWen").
/// </summary>
public partial class LunWen {
    /// <summary>Creates an empty record.</summary>
    public LunWen() { }

    /// <summary>Record id (wt_id).</summary>
    public int wt_id { get; set; }

    /// <summary>Institution name.</summary>
    public string wt_Unit { get; set; }

    /// <summary>Source site label, e.g. "知网" or "万文".</summary>
    public string wt_Type { get; set; }

    /// <summary>Paper title.</summary>
    public string wt_Title { get; set; }

    /// <summary>Paper URL.</summary>
    public string wt_Url { get; set; }

    /// <summary>Resource type.</summary>
    public string wt_ResouceType { get; set; }

    /// <summary>Author(s).</summary>
    public string wt_Author { get; set; }

    /// <summary>Source.</summary>
    public string wt_Source { get; set; }

    /// <summary>Journal issue.</summary>
    public string wt_Volume { get; set; }

    /// <summary>Content.</summary>
    public string wt_Content { get; set; }

    /// <summary>Download date; null when not set.</summary>
    public DateTime? wt_DownDate { get; set; }
}

Python采集知网  https://www.cnblogs.com/luomingui/p/17701445.html

posted @ 2022-05-26 14:10  峡谷少爷  阅读(297)  评论(0)  编辑  收藏  举报