C#爬取国家统计局五级地址
// Source index page: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
// The crawl starts from a single province page and works downward. To crawl every
// province in one run, start one level further out (from the index page) instead.

// URL of the province page the crawl starts from.
public string url;

// Name of the table the crawled rows are stored in.
public string dbname;

// Province-level code.
public string code;

// Province name.
public string name;

// Database name.
public static string database = "TEST";

// Consecutive download failures for the URL currently being fetched
// (covers connection timeouts and other unexpected disconnects).
public int flag = 0;

// A URL is abandoned once its download has failed more than this many times.
private const int MaxDownloadRetries = 10;

/// <summary>
/// Handler entry point: reads the start URL from the posted form, makes sure the
/// storage table exists, then crawls the five levels in order
/// (province, city, county, town, village) and writes a success message.
/// </summary>
/// <param name="context">The current HTTP context; the "url" form field is read from its request.</param>
public void ProcessRequest(HttpContext context)
{
    // NOTE(review): only url is assigned here; dbname, code and name are not set
    // anywhere in this block and must be populated by the caller - confirm.
    url = System.Web.HttpUtility.HtmlDecode(context.Request.Form["url"]);

    TableExist(dbname);
    Provincial();
    City();
    County();
    Town();
    Village();

    context.Response.Write("爬取成功");
}

/// <summary>
/// Creates the storage table in <see cref="database"/> when no table with the given name exists yet.
/// </summary>
/// <param name="dbname">Name of the table to look up and, if missing, create.</param>
public void TableExist(string dbname)
{
    DataTable dt = bll.SelectbySql(
        "SELECT table_name FROM information_schema.TABLES WHERE table_name =" + SqlLiteral(dbname));
    if (dt.Rows.Count > 0)
    {
        return;
    }

    // NOTE(review): the table and database names are spliced in as identifiers and
    // cannot be parameterized through bll; they must come from trusted configuration.
    string sql = "USE [" + database + "]\r\n"
        + "SET ANSI_NULLS ON\r\n"
        + "SET QUOTED_IDENTIFIER ON\r\n"
        + "CREATE TABLE[dbo].[" + dbname + "]("
        + "[ID][int] IDENTITY(1, 1) NOT NULL,"
        + "[Code] [nvarchar] (20) NULL,"
        + "[ParentCode] [nvarchar] (20) NULL,"
        + "[Name] [nvarchar] (50) NULL,"
        + "[Path] [nvarchar] (100) NULL,"
        + "[PathName] [nvarchar] (200) NULL,"
        + "[Levels] [int] NULL,"
        + "[Urls]" + "[nvarchar]" + "(max) NULL,"
        + "[DeleteMark] [bit] NULL,"
        + "CONSTRAINT[PK_" + dbname + "] PRIMARY KEY CLUSTERED"
        + "("
        + "[ID] ASC"
        + ")WITH(PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON[PRIMARY]"
        + ") ON[PRIMARY] TEXTIMAGE_ON[PRIMARY]\r\n";
    bll.RunbySql(sql);
}

/// <summary>
/// Inserts the level-0 (province) row built from <see cref="code"/>, <see cref="name"/> and <see cref="url"/>.
/// </summary>
public void Provincial()
{
    InsertRegion(code, "0", name, code, name, 0, url);
}

/// <summary>Crawls the page of every level-0 row and stores its "citytr" rows as level 1.</summary>
public void City()
{
    CrawlLevel(0, "citytr", 1, true);
}

/// <summary>Crawls the page of every level-1 row and stores its "countytr" rows as level 2.</summary>
public void County()
{
    CrawlLevel(1, "countytr", 1, true);
}

/// <summary>Crawls the page of every level-2 row and stores its "towntr" rows as level 3.</summary>
public void Town()
{
    CrawlLevel(2, "towntr", 1, true);
}

/// <summary>
/// Crawls the page of every level-3 row and stores its "villagetr" rows as level 4.
/// Village rows take the name from the third cell and are stored without a child URL.
/// </summary>
public void Village()
{
    CrawlLevel(3, "villagetr", 2, false);
}

/// <summary>
/// Downloads a page and decodes it as gb2312, retrying on failure.
/// </summary>
/// <param name="Urls">Address of the page to download.</param>
/// <returns>The decoded HTML, or an empty string once the retry budget is exhausted.</returns>
public string returnHtml(string Urls)
{
    // Start every URL with a fresh budget; otherwise one exhausted URL would leave
    // the counter above the limit and later failures would get a single attempt.
    flag = 0;

    while (true)
    {
        try
        {
            using (WebClient webClient = new WebClient())
            {
                string html = Encoding.GetEncoding("gb2312").GetString(webClient.DownloadData(Urls));
                flag = 0;
                return html;
            }
        }
        catch (Exception)
        {
            // Deliberately best-effort: any failure counts as a failed attempt so a
            // single bad page cannot abort the whole crawl.
            flag++;
            if (flag > MaxDownloadRetries)
            {
                return "";
            }
        }
    }
}

// Shared crawl step: for every stored row at parentLevel that has a URL, downloads the page,
// selects the rows with CSS class rowClass and inserts each one at parentLevel + 1.
private void CrawlLevel(int parentLevel, string rowClass, int nameCellIndex, bool followLinks)
{
    DataTable parents = bll.SelectbySql(
        "select * from " + dbname + " where DeleteMark=0 and Levels=" + parentLevel
        + " and Urls is not null and Urls<>''");

    foreach (DataRow parent in parents.Rows)
    {
        string parentUrl = parent["Urls"].ToString();
        string parentCode = parent["Code"].ToString();
        string parentPath = parent["Path"].ToString();
        string parentPathName = parent["PathName"].ToString();

        string html = returnHtml(parentUrl);
        if (html.Length == 0)
        {
            // Download gave up; there is nothing to parse for this parent.
            continue;
        }

        NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
        Elements rows = doc.GetElementsByClass(rowClass);

        foreach (Element row in rows)
        {
            if (row.Children.Count <= nameCellIndex)
            {
                // Malformed row without the expected cells.
                continue;
            }

            // First cell holds the code, the cell at nameCellIndex holds the name.
            Element codeNode = row.Children[0];
            Element nameNode = row.Children[nameCellIndex];
            string childUrl = "";

            if (followLinks)
            {
                // Cells of regions that have a sub-page wrap their text in a link;
                // step into it so the href of the child page can be read.
                if (codeNode.Children.Count > 0)
                {
                    codeNode = codeNode.Children[0];
                    if (nameNode.Children.Count > 0)
                    {
                        nameNode = nameNode.Children[0];
                    }
                }

                if (codeNode.HasAttr("href"))
                {
                    // The href is relative to the directory of the parent page.
                    childUrl = parentUrl.Substring(0, parentUrl.LastIndexOf('/') + 1) + codeNode.Attr("href");
                }
            }

            string regionCode = codeNode.Text();
            string regionName = nameNode.Text();

            InsertRegion(
                regionCode,
                parentCode,
                regionName,
                parentPath + "/" + regionCode,
                parentPathName + "/" + regionName,
                parentLevel + 1,
                childUrl);
        }
    }
}

// Inserts one region row; the column order follows the table created by TableExist.
private void InsertRegion(string regionCode, string parentCode, string regionName, string path, string pathName, int level, string urls)
{
    bll.RunbySql(
        "insert into " + dbname + " values("
        + SqlLiteral(regionCode) + ","
        + SqlLiteral(parentCode) + ","
        + SqlLiteral(regionName) + ","
        + SqlLiteral(path) + ","
        + SqlLiteral(pathName) + ","
        + level + ","
        + SqlLiteral(urls) + ",0)");
}

// Renders a value as a Unicode T-SQL string literal, doubling embedded single quotes so
// posted or scraped text cannot terminate the literal. Null is rendered as an empty string.
private static string SqlLiteral(string value)
{
    return "N'" + (value ?? "").Replace("'", "''") + "'";
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构