C#爬取国家统计局五级地址
// Crawls the five-level administrative division codes published by the
// National Bureau of Statistics:
// http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
// The crawl starts from a single province page and works downwards. To crawl
// every province in one run it would have to start one level higher.

// URL of the page the crawl starts from (read from the posted form).
public string url;
// Name of the table the crawled rows are stored in.
public string dbname;
// Province code.
public string code;
// Province name.
public string name;
// Database name used when the storage table has to be created.
public static string database = "TEST";
// Number of consecutive failed download attempts for the URL currently being
// fetched; bounds the retries after timeouts and other unexpected disconnects.
public int flag = 0;

// Retries allowed per URL after the first failed attempt.
private const int MaxDownloadRetries = 10;

/// <summary>
/// Handles the request: reads the start URL from the posted form, makes sure
/// the storage table exists, then crawls the levels in order (each level reads
/// the rows the previous one stored) and writes a success message.
/// </summary>
/// <param name="context">Current HTTP context; supplies the form and the response.</param>
public void ProcessRequest(HttpContext context)
{
    // NOTE(review): only url is read from the request here. dbname, code and
    // name are not assigned anywhere in the visible code - confirm they are
    // populated before this handler runs.
    url = System.Web.HttpUtility.HtmlDecode(context.Request.Form["url"]);

    TableExist(dbname);
    Provincial();
    City();
    County();
    Town();
    Village();

    context.Response.Write("爬取成功");
}

/// <summary>
/// Creates the storage table in <see cref="database"/> when
/// information_schema.TABLES has no table with the given name.
/// </summary>
/// <param name="dbname">Name of the table to look up and, if missing, create.</param>
public void TableExist(string dbname)
{
    DataTable existing = bll.SelectbySql(
        "SELECT table_name FROM information_schema.TABLES WHERE table_name =" + SqlLiteral(dbname));
    if (existing.Rows.Count > 0)
    {
        return;
    }

    string sql = "USE [" + database + "]\r\n"
        + "SET ANSI_NULLS ON\r\n"
        + "SET QUOTED_IDENTIFIER ON\r\n"
        + "CREATE TABLE[dbo].[" + dbname + "]("
        + "[ID][int] IDENTITY(1, 1) NOT NULL,"
        + "[Code] [nvarchar] (20) NULL,"
        + "[ParentCode] [nvarchar] (20) NULL,"
        + "[Name] [nvarchar] (50) NULL,"
        + "[Path] [nvarchar] (100) NULL,"
        + "[PathName] [nvarchar] (200) NULL,"
        + "[Levels] [int] NULL,"
        + "[Urls]" + "[nvarchar]" + "(max) NULL,"
        + "[DeleteMark] [bit] NULL,"
        + "CONSTRAINT[PK_" + dbname + "] PRIMARY KEY CLUSTERED"
        + "("
        + "[ID] ASC"
        + ")WITH(PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON[PRIMARY]"
        + ") ON[PRIMARY] TEXTIMAGE_ON[PRIMARY]\r\n";
    bll.RunbySql(sql);
}

/// <summary>
/// Stores the level-0 (province) row built from <see cref="code"/>,
/// <see cref="name"/> and <see cref="url"/>; its parent code is "0".
/// </summary>
public void Provincial()
{
    InsertRow(code, "0", name, code, name, 0, url);
}

/// <summary>Crawls the "citytr" rows of every stored level-0 page into level 1.</summary>
public void City()
{
    CrawlLevel(0, "citytr", 1, true);
}

/// <summary>Crawls the "countytr" rows of every stored level-1 page into level 2.</summary>
public void County()
{
    CrawlLevel(1, "countytr", 1, true);
}

/// <summary>Crawls the "towntr" rows of every stored level-2 page into level 3.</summary>
public void Town()
{
    CrawlLevel(2, "towntr", 1, true);
}

/// <summary>
/// Crawls the "villagetr" rows of every stored level-3 page into level 4.
/// These rows take the name from the third cell and never store a child URL.
/// </summary>
public void Village()
{
    CrawlLevel(3, "villagetr", 2, false);
}

/// <summary>
/// Downloads a page and decodes it as gb2312. A failed download is retried up
/// to <see cref="MaxDownloadRetries"/> times; when every attempt fails an empty
/// string is returned so the caller simply finds no rows on that page.
/// </summary>
/// <param name="Urls">Address of the page to download.</param>
/// <returns>The decoded page text, or an empty string when all attempts failed.</returns>
public string returnHtml(string Urls)
{
    // Start counting afresh for every URL. Without this, one URL that used up
    // its retries would leave the counter above the limit and every later
    // failing URL would get no retry at all.
    flag = 0;

    while (true)
    {
        try
        {
            using (WebClient webClient = new WebClient())
            {
                string html = Encoding.GetEncoding("gb2312").GetString(webClient.DownloadData(Urls));
                flag = 0;
                return html;
            }
        }
        catch (Exception)
        {
            // Deliberately broad: the crawl is best-effort, a page that cannot
            // be fetched is skipped rather than aborting the whole run.
            flag++;
            if (flag > MaxDownloadRetries)
            {
                return "";
            }
        }
    }
}

// Shared crawl step: for every stored row at parentLevel that has a URL, downloads
// the page, reads the table rows carrying rowClass and stores each one as a child
// at parentLevel + 1.
// NOTE(review): a page whose rows carry a different class than rowClass yields
// nothing here - confirm that no parent page skips a level.
private void CrawlLevel(int parentLevel, string rowClass, int nameCellIndex, bool followLinks)
{
    DataTable parents = bll.SelectbySql(
        "select * from " + dbname + " where DeleteMark=0 and Levels=" + parentLevel
        + " and Urls is not null and Urls<>''");

    foreach (DataRow parent in parents.Rows)
    {
        string pageUrl = parent["Urls"].ToString();
        string parentCode = parent["Code"].ToString();
        string parentPath = parent["Path"].ToString();
        string parentPathName = parent["PathName"].ToString();

        NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(returnHtml(pageUrl));

        // Each matching element is one table row; its cells hold the code and the name.
        foreach (Element row in doc.GetElementsByClass(rowClass))
        {
            if (row.Children.Count <= nameCellIndex)
            {
                // Malformed row: skip it instead of failing the whole crawl.
                continue;
            }

            Element codeNode = row.Children[0];
            Element nameNode = row.Children[nameCellIndex];

            // When the code cell wraps its text in a child element (the link to
            // the next level), read code, name and href from the child elements.
            if (followLinks && codeNode.Children.Count > 0)
            {
                codeNode = codeNode.Children[0];
                if (nameNode.Children.Count > 0)
                {
                    nameNode = nameNode.Children[0];
                }
            }

            string childUrl = "";
            if (followLinks && codeNode.HasAttr("href"))
            {
                // The href is relative to the directory of the parent page.
                childUrl = pageUrl.Substring(0, pageUrl.LastIndexOf('/') + 1) + codeNode.Attr("href");
            }

            string childCode = codeNode.Text();
            string childName = nameNode.Text();
            InsertRow(
                childCode,
                parentCode,
                childName,
                parentPath + "/" + childCode,
                parentPathName + "/" + childName,
                parentLevel + 1,
                childUrl);
        }
    }
}

// Inserts one row; the values follow the column order of the table created by
// TableExist (Code, ParentCode, Name, Path, PathName, Levels, Urls, DeleteMark).
private void InsertRow(string rowCode, string parentCode, string rowName, string path, string pathName, int level, string pageUrl)
{
    bll.RunbySql(
        "insert into " + dbname + " values("
        + SqlLiteral(rowCode) + ","
        + SqlLiteral(parentCode) + ","
        + SqlLiteral(rowName) + ","
        + SqlLiteral(path) + ","
        + SqlLiteral(pathName) + ","
        + level + ","
        + SqlLiteral(pageUrl) + ",0)");
}

// Renders a value as a Unicode T-SQL string literal. Embedded single quotes are
// doubled so that scraped or posted text cannot terminate the literal, and the N
// prefix keeps Chinese text intact whatever the database collation is.
private static string SqlLiteral(string value)
{
    return "N'" + (value ?? string.Empty).Replace("'", "''") + "'";
}