MongoDB CRUD 操作,采集部分代码
using System;
using System.Collections.Generic;
using System.ComponentModel.Design;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using CDPWIB.DAL;
using CDPWIB.Data;
using CommonUtility;
using HtmlAgilityPack;
using MongoDB.Driver;
using MongoDB.Driver.Builders;
using MongoDB.Driver.Linq;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using WebKit;

namespace CDPWIB.WebCollection
{
    /// <summary>
    /// Scrapes novel categories, ranking lists, volume/chapter tables of contents and
    /// click counters from qidian.com pages and stores them through the "Noval" MongoDB
    /// collections and the <c>PublicMethod</c> helpers.
    /// </summary>
    internal class QiDianCol : INovalCollect
    {
        // The monthly click ranking is read from pages 1..10.
        private const int FirstRankingPage = 1;
        private const int LastRankingPage = 10;

        // XPath positions are 1-based; book rows are read from tr[2] up to tr[51] (50 rows per page).
        private const int FirstBookRow = 2;
        private const int LastBookRow = 51;

        private int Source = Convert.ToInt32(NovalSource.QiDian);

        // Not referenced by any method of this class; kept so construction behaves as before.
        private readonly MongoCollection<NovalTempBase> Novalcol =
            MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));

        /// <summary>
        /// Downloads the category and sub-category script files, parses the embedded arrays
        /// and replaces every stored <c>NovalTypeTemp</c> / <c>NovalSubType</c> document of
        /// this source with the freshly parsed entries.
        /// </summary>
        /// <remarks>
        /// Any download, parsing or database exception propagates to the caller unchanged.
        /// </remarks>
        public void GetNovalTypeTemp()
        {
            var typecol = MongoConnectionFactory.GetMongoCollction<NovalTypeTemp>("Noval", typeof(NovalTypeTemp));
            var subcol = MongoConnectionFactory.GetMongoCollction<NovalSubType>("Noval", typeof(NovalSubType));

            // Top-level categories: http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917
            // NOTE(review): each chain contains two visually identical Replace(" ", "") calls; the
            // second one may originally have targeted a different whitespace character - confirm.
            string typeshtml = NetHelper.HttpGet("http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917")
                .Replace("/", "")
                .Replace(" ", "")
                .Replace("\r", "")
                .Replace("\n", "")
                .Replace("\t", "")
                .Replace("|", "")
                .Replace(" ", "");

            string subtypes = NetHelper.HttpGet("http://script.cmfu.com/script/BookStore.js ")
                .Replace(" ", "")
                .Replace("\r", "")
                .Replace("\n", "")
                .Replace("\t", "")
                .Replace("|", "")
                .Replace(" ", "");

            const RegexOptions options =
                RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline;

            // The lazy match stops right before the closing "]]", so it is appended again.
            Match mtype = Regex.Match(typeshtml, "CategoryArr:(.*?)]]", options);
            string typesstring = mtype.Groups[1].Value + "]]";
            JArray typearr = (JArray)JsonConvert.DeserializeObject(typesstring);

            Match msubtype = Regex.Match(subtypes, "SubCategoryArr=(.*?);", options);
            string subtypesstring = msubtype.Groups[1].Value;
            JArray subarr = (JArray)JsonConvert.DeserializeObject(subtypesstring);

            // CategoryArr entries are [name, id] pairs; the entry with id "-1" is the
            // "all" pseudo-category and is skipped.
            List<NovalTypeTemp> lstypes = new List<NovalTypeTemp>(10);
            foreach (JToken entry in typearr)
            {
                string webNumText = entry[1].ToString();
                if (webNumText == "-1")
                {
                    continue;
                }

                lstypes.Add(new NovalTypeTemp()
                {
                    WebNum = webNumText.ToInt(),
                    Name = entry[0].ToString(),
                    Source = Source
                });
            }

            IMongoQuery query = Query<NovalTypeTemp>.EQ(p => p.Source, Source);
            typecol.Remove(query);
            typecol.InsertBatch(lstypes);

            // SubCategoryArr entries are read as [parent id, sub id, name]; they are collected
            // grouped by parent category, in the order of the parsed categories.
            List<NovalSubType> subtypels = new List<NovalSubType>(300);
            foreach (var parentType in lstypes)
            {
                string parentKey = parentType.WebNum.ToString();
                foreach (JToken obj in subarr)
                {
                    if (obj[0].ToString() != parentKey)
                    {
                        continue;
                    }

                    subtypels.Add(new NovalSubType()
                    {
                        Name = obj[2].ToString(),
                        ParentWebNum = parentType.WebNum,
                        WebNum = obj[1].ToString().ToInt(),
                        Source = Source
                    });
                }
            }

            query = Query<NovalSubType>.EQ(p => p.Source, Source);
            subcol.Remove(query);
            subcol.InsertBatch(subtypels);
        }

        /// <summary>
        /// Reads the click ranking pages and hands the de-duplicated list of books to
        /// <c>PublicMethod.InsertAndUpdateNovalTmp</c>.
        /// </summary>
        /// <remarks>
        /// A page without the "textlist" element is skipped, a page with fewer rows than
        /// expected is read up to its last row, and a row with missing cells is ignored,
        /// so one malformed page no longer discards the books collected so far.
        /// </remarks>
        public void GetNovals()
        {
            HtmlDocument htmldocc = new HtmlDocument();
            List<NovalTempBase> qdls = new List<NovalTempBase>(500);
            HashSet<int> seenBookNums = new HashSet<int>();

            for (int page = FirstRankingPage; page <= LastRankingPage; page++)
            {
                string sourcehtml =
                    NetHelper.HttpGet("http://top.qidian.com/Book/TopDetail.aspx?TopType&Time=2&PageIndex=" + page);
                htmldocc.LoadHtml(sourcehtml);

                var table = htmldocc.GetElementbyId("textlist");
                if (table == null)
                {
                    continue;
                }

                for (int row = FirstBookRow; row <= LastBookRow; row++)
                {
                    var trdoc = table.SelectSingleNode("tr[" + row + "]");
                    if (trdoc == null)
                    {
                        // Positional XPath: when tr[row] is absent, all later rows are absent too.
                        break;
                    }

                    NovalTempBase book = ParseRankingRow(trdoc, seenBookNums);
                    if (book != null)
                    {
                        qdls.Add(book);
                    }
                }
            }

            PublicMethod.InsertAndUpdateNovalTmp(qdls, Source);
        }

        /// <summary>
        /// Builds a <c>NovalTempBase</c> from one ranking table row.
        /// </summary>
        /// <param name="trdoc">The table row node.</param>
        /// <param name="seenBookNums">Book numbers already collected; updated when a new book is returned.</param>
        /// <returns>The parsed book, or <c>null</c> when a cell is missing or the book was already collected.</returns>
        private NovalTempBase ParseRankingRow(HtmlNode trdoc, HashSet<int> seenBookNums)
        {
            var tdtype = trdoc.SelectSingleNode("td[2]/a");
            var tdbook = trdoc.SelectSingleNode("td[3]/a[1]");
            var tdclick = trdoc.SelectSingleNode("td[4]");
            var tdauth = trdoc.SelectSingleNode("td[5]/a");
            if (tdtype == null || tdbook == null || tdclick == null || tdauth == null)
            {
                return null;
            }

            Match typematch = Regex.Match(tdtype.OuterHtml, "ChannelId=(\\d*?)&SubCategoryId=(\\d*?)'");
            Match bookmatch = Regex.Match(tdbook.OuterHtml, "Book/(\\d*?).aspx");
            Match authmatch = Regex.Match(tdauth.OuterHtml, "id=(\\d*?)\"");

            int booknum = bookmatch.Groups[1].Value.ToInt();
            if (!seenBookNums.Add(booknum))
            {
                // Same book already seen on this or an earlier page.
                return null;
            }

            // Cover URL pattern, e.g. http://image.cmfu.com/books/3127618/3127618.jpg
            string titleimg = "http://image.cmfu.com/books/" + booknum + "/" + booknum + ".jpg";

            return new NovalTempBase()
            {
                AuthName = tdauth.InnerText.Trim(),
                AuthId = authmatch.Groups[1].Value.ToInt(),
                SubType = typematch.Groups[2].Value.ToInt(),
                TitleImg = titleimg,
                Title = tdbook.InnerText.Trim(),
                TotalClick = tdclick.InnerText.ToInt(),
                TotalComment = 0,
                Type = typematch.Groups[1].Value.ToInt(),
                SourceWebNum = booknum,
                Source = Source
            };
        }

        /// <summary>
        /// Collects the chapters (and volumes, where present) of every stored novel of this source.
        /// </summary>
        public void GetNovalChapers()
        {
            var novalcol = MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));
            var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();

            foreach (var book in books)
            {
                GetSingleNovalChapers(book.SourceWebNum);
            }
        }

        /// <summary>
        /// Downloads the table of contents of one novel, replaces its stored volumes and
        /// passes its chapters to <c>PublicMethod.InsertChapterTempToSQL</c>.
        /// </summary>
        /// <param name="novalwebnum">Book number used in the reader URL.</param>
        /// <remarks>
        /// Best effort: a failure while downloading, parsing or storing is written to the
        /// trace output and the method returns without throwing.
        /// </remarks>
        public void GetSingleNovalChapers(int novalwebnum)
        {
            IMongoQuery query = Query.And(
                Query<NovalVolumeTemp>.EQ(p => p.NovalWebNum, novalwebnum),
                Query<NovalVolumeTemp>.EQ(p => p.Source, Source));

            var volumecol = MongoConnectionFactory.GetMongoCollction<NovalVolumeTemp>("Noval", typeof(NovalVolumeTemp));

            List<NovalChapterTemp> lschapters = new List<NovalChapterTemp>(1000);
            List<NovalVolumeTemp> lsvolumes = new List<NovalVolumeTemp>(10);
            int chapterorder = 1;
            int volumeorder = 1;

            // Table-of-contents page, e.g. http://read.qidian.com/BookReader/3127618.aspx
            string url = "http://read.qidian.com/BookReader/" + novalwebnum + ".aspx";

            try
            {
                HtmlDocument htmldocc = new HtmlDocument();
                htmldocc.LoadHtml(NetHelper.HttpGet(url));

                var doc = htmldocc.GetElementbyId("content");
                if (doc == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "QiDianCol.GetSingleNovalChapers: no 'content' element for book {0} ({1})",
                        novalwebnum, url);
                    return;
                }

                // The children are read in pairs: div[i] is a volume header, div[i + 1] its chapter list.
                int i = 1;
                var topdoc = doc.SelectSingleNode("div[" + i + "]");
                while (topdoc != null)
                {
                    // The original author notes that VIP sections have no header anchor;
                    // their chapters get volume id 0.
                    int topnum = 0;
                    var topa = topdoc.SelectSingleNode("div/a");
                    if (topa != null)
                    {
                        // href looks like http://www.qidian.com/BookReader/vol,107580,486625.aspx
                        Match m = Regex.Match(topa.OuterHtml, ",(\\d*?).aspx");
                        topnum = m.Groups[1].Value.ToInt();

                        lsvolumes.Add(new NovalVolumeTemp()
                        {
                            Sort = volumeorder,
                            WebNum = topnum,
                            Name = ExtractVolumeName(topdoc.SelectSingleNode("div/b")),
                            NovalWebNum = novalwebnum,
                            Source = Source
                        });
                        volumeorder++;
                    }

                    // SelectNodes yields null (not an empty collection) when nothing matches,
                    // so a header without chapters is treated as an empty volume.
                    var contextdoc = doc.SelectSingleNode("div[" + (i + 1) + "]");
                    var chaperas = contextdoc == null ? null : contextdoc.SelectNodes("div/ul/li/a");
                    if (chaperas != null)
                    {
                        // Chapter anchors look like
                        // <a href="http://read.qidian.com/BookReader/107580,20901221.aspx" ...>;
                        // the number after the comma is taken as the chapter number.
                        foreach (HtmlNode chapera in chaperas)
                        {
                            Match chapmatchwebnum = Regex.Match(chapera.OuterHtml, ",(\\d*?).aspx");
                            lschapters.Add(new NovalChapterTemp()
                            {
                                Name = chapera.InnerText.Trim(),
                                Sort = chapterorder,
                                WebNum = chapmatchwebnum.Groups[1].Value.ToInt(),
                                VolumeId = topnum,
                                NovalWebNum = novalwebnum,
                                Source = Source
                            });
                            chapterorder++;
                        }
                    }

                    i += 2;
                    topdoc = doc.SelectSingleNode("div[" + i + "]");
                }

                volumecol.Remove(query);
                // NOTE(review): the legacy driver reportedly rejects an empty batch; skipping the
                // call keeps the chapters below from being lost for books without volume headers.
                if (lsvolumes.Count > 0)
                {
                    volumecol.InsertBatch(lsvolumes);
                }

                PublicMethod.InsertChapterTempToSQL(lschapters, Source, novalwebnum);
            }
            catch (Exception ex)
            {
                // Still best effort (one broken book must not stop the caller's loop),
                // but the failure is no longer silent.
                System.Diagnostics.Trace.TraceError(
                    "QiDianCol.GetSingleNovalChapers failed for book {0} ({1}): {2}",
                    novalwebnum, url, ex);
            }
        }

        /// <summary>
        /// Extracts the volume name from the header's bold node: the text after the first ';'
        /// once spaces are removed.
        /// </summary>
        /// <param name="titleNode">The "div/b" node of a volume header; may be <c>null</c>.</param>
        /// <returns>
        /// The part after the first ';', the whole cleaned text when there is no ';',
        /// or an empty string when the node is missing.
        /// </returns>
        private static string ExtractVolumeName(HtmlNode titleNode)
        {
            if (titleNode == null)
            {
                return string.Empty;
            }

            // NOTE(review): splitting on ';' presumably skips a leading HTML entity - confirm.
            string[] parts = titleNode.InnerText.Trim().Replace(" ", "").Split(';');
            return parts.Length > 1 ? parts[1] : parts[0];
        }

        /// <summary>
        /// Refreshes <c>TotalClick</c> of every stored novel of this source from its book page.
        /// Comment counts are not collected here.
        /// </summary>
        /// <remarks>
        /// A book whose page lacks the click cell, or whose cell text is not an integer, is
        /// skipped; download and database exceptions still propagate to the caller.
        /// </remarks>
        public void GetNovalCilckComment()
        {
            var novalcol = MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));
            var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();
            HtmlDocument htmldocc = new HtmlDocument();

            foreach (var novalTempBase in books)
            {
                // Book page, e.g. http://www.qidian.com/Book/3106580.aspx
                string url = "http://www.qidian.com/Book/" + novalTempBase.SourceWebNum + ".aspx";
                htmldocc.LoadHtml(NetHelper.HttpGet(url));

                var contentDiv = htmldocc.GetElementbyId("contentdiv");
                var clickCell = contentDiv == null
                    ? null
                    : contentDiv.SelectSingleNode("div/div[1]/table/tr/td[1]");
                if (clickCell == null)
                {
                    continue;
                }

                // Strip the "total clicks" label and the colon, leaving only the number.
                string clickcount = clickCell.InnerText
                    .Replace("总点击", "")
                    .Replace(":", "")
                    .Trim();

                int click;
                if (!int.TryParse(clickcount, out click))
                {
                    continue;
                }

                novalTempBase.TotalClick = click;
                novalcol.Save(novalTempBase);
            }
        }
    }
}