博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

解析网页源码方式

Posted on 2015-01-15 18:22  system_kk  阅读(1022)  评论(0)  编辑  收藏  举报

 

解析HTML源码

1,正则获取:

 // One flight-segment line: flight number, cabin, flight date, origin+destination,
 // status code, departure time and arrival time, each captured as a named group.
 string str_pattern = "(?<FlightNo>[A-Z]{2}[0-9]{4})\\s*(?<Cabin>[A-Z0-9]{1,2})\\s*(?<FlghtDate>[A-Z]{2}[0-9]{2}[A-Z]{3})\\s*(?<FromTo>[A-Z]{6})\\s*(?<Statu>[A-Za-z]{2}[0-9]{1})\\s*(?<FromDt>[0-9]{4})\\s*(?<ToDt>[0-9]{4})";
 Regex regex = new Regex(str_pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant);

 // Matches() returns an empty collection when nothing matches, so the loop body
 // simply never runs in that case.
 foreach (Match segment in regex.Matches(str_html_part2))
 {
     GroupCollection groups = segment.Groups;
     string flightNo = groups["FlightNo"].Value.Trim();
     string cabin = groups["Cabin"].Value.Trim();
     string flightDate = groups["FlghtDate"].Value.Trim();
     string fromTo = groups["FromTo"].Value.Trim();
     string status = groups["Statu"].Value.Trim();
     string departTime = groups["FromDt"].Value.Trim();
     string arriveTime = groups["ToDt"].Value.Trim();
     // NOTE(review): the original left a commented-out signature here, suggesting the
     // captured values are meant to be passed on to something like
     // IsExistFlight(allcout, cabin, FromDt, ToDt, sp_code, fromcity, tocity, fromdate).
 }

 

1  if (!Regex.IsMatch(match.Groups["Result"].Value.ToString(), @"^[0-9]*$"))
View Code

 



// Generic template: visit every match of strPattern inside htmlContent.
// (The listing had been collapsed onto a single line, which made the trailing
// "//" comment swallow both closing braces.)
Regex regex = new Regex(strPattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
if (regex.IsMatch(htmlContent))
{
    MatchCollection matchCollection = regex.Matches(htmlContent);
    foreach (Match match in matchCollection)
    {
        string value = match.Value; // the matched text
    }
}

 

测试解析PNR航班 

rt编码信息 如: 

RTAAAAAA                                                                       
 1.MENG/HONG MS 2.ZHANG/DEPING MR 3.ZHANG/MUHAN MS AAAAAA                       
 4.  NH964  W   TU18JUL  PEKHND HK3   0825 1250      SEAME  3 I                 
 5.  NH963  V   MO24JUL  HNDPEK HK3   1720 2010      SEAME  I 3                 
 6.SZX/T SZX/T 0755-82819601/SHENZHEN TIANTAI AIR INTERNATIONAL TRAVEL AGENCY   
    CO.,LTD ABCDEFG                                                             
 7.18912790711                                                                  
 8.18912790711                                                                  
 9.TL/0625/18JUL/SZX000                                                         
10.SSR ADTK 1E TO NH BY 30JUN 1200 OTHERWISE WILL BE XLD                        
11.SSR DOCS NH HK1 P/CN/G42027462/CN/13OCT68/F/27APR20/MENG/HONG/P1             
12.SSR DOCS NH HK1 P/CN/G40834536/CN/08SEP66/M/25MAY20/ZHANG/DEPING/P2          
13.SSR DOCS NH HK1 P/CN/E81525458/CN/07MAY99/F/19JUL26/ZHANG/MUHAN/P3          +
                                                                               
                                                                                
                                                                                
PN                                                                             
14.SSR CTCM NH HK1 18912790729/P3                                              -
15.OSI NH CTCT18912790729                                                       
16.RMK TJ AUTH SZV000/T                                                         
17.RMK 备注信息                                                                 
18.RMK 1A/M42ROX                                                                
19.SZX000 

  

解析方法

  /// <summary>
  /// Parses the text of an RT (retrieve PNR) response into an <c>OrderView</c>:
  /// the record locator, the flight segments, the passengers and, when
  /// "SSR DOCS" lines are present, each passenger's travel-document details.
  /// </summary>
  /// <param name="pnrtxt">Raw RT response text; null is treated as empty text.</param>
  /// <param name="msg">Set to a user-facing error message when parsing fails; not modified on success.</param>
  /// <returns>
  /// The populated <c>OrderView</c> on success. When no PNR, no flight or no passenger
  /// is matched, the partially filled <c>OrderView</c> is returned with <paramref name="msg"/> set.
  /// Returns null when a matched flight segment or passenger cannot be converted.
  /// </returns>
  public OrderView GetOrderViewByRtPNRTxt(string pnrtxt, ref string msg)
  {
      ILogHandle handle = new ILogHandle(userid, "1E", "SELF", "解析RTPNR");
      DateTime _now = DateTime.Now;
      int restime = 0;

      OrderView result = new OrderView();
      // A null input used to throw from Regex before any logging happened;
      // treat it as empty text so it is reported as "no PNR found".
      pnrtxt = pnrtxt ?? string.Empty;
      string pnr = "";
      string strreq = pnrtxt;
      string strrsp = "";

      // The record locator is the 6-character token that follows a passenger title.
      Regex rex = new Regex("\\s*(MR|MS|MRS|MISS)\\s*(?<PNR>[a-zA-Z0-9]{6})", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Multiline);
      Match pnrMatch = rex.Match(pnrtxt);
      if (!pnrMatch.Success)
      {
          msg = "未解析到PNR";
          restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
          handle.Error(strreq, strrsp, "未解析到PNR", restime);
          return result;
      }
      pnr = pnrMatch.Groups["PNR"].Value.Trim();

      // ---- flight segments ----
      int seq = 1;
      try
      {
          string str_pattern = "(?<FlightNo>[0-9A-Z]{2}[0-9]{1,4})\\s*(?<Cabin>[A-Z0-9]{1,2})\\s*(?<FlghtDate>[A-Z]{2}[0-9]{2}[A-Z]{3})\\s*(?<FromTo>[A-Z]{6})\\s*(?<Statu>[A-Za-z]{2}[0-9]{1})\\s*(?<FromDt>[0-9]{4}[+]?[1-9]?)\\s*(?<ToDt>[0-9]{4}[+]?[1-9]?)";
          Regex regex = new Regex(str_pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant);
          MatchCollection matchCollection = regex.Matches(pnrtxt);
          if (matchCollection.Count == 0)
          {
              msg = "未匹配到任何航班信息";
              restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
              handle.Error(strreq, strrsp, "未匹配到任何航班信息", restime);
              return result;
          }

          result.flights = new List<FlightModel>();
          DateTime dtfdate = DateTime.Now; // departure date of the first segment, set when seq == 1

          foreach (Match match in matchCollection)
          {
              // FlghtDate looks like "TU18JUL": weekday(2) + day(2) + month(3).
              string FlghtDate = match.Groups["FlghtDate"].Value.Trim();
              string FromTo = match.Groups["FromTo"].Value.Trim();
              FlightModel f = new FlightModel();

              // The text carries no year: assume the current year and roll over to next
              // year only when the date is already in the past. Compare against Today
              // (midnight), not Now, so a flight departing today is not pushed a year ahead.
              DateTime dttrgar = Convert.ToDateTime(DateTime.Now.Year.ToString() + "-" + MakePnrText.GetMonth(FlghtDate.Substring(4)) + "-" + FlghtDate.Substring(2, 2));
              if (dttrgar < DateTime.Today)
              {
                  dttrgar = dttrgar.AddYears(1);
              }

              string fdate = dttrgar.ToString("yyyy-MM-dd");
              if (seq == 1)
              {
                  dtfdate = Convert.ToDateTime(fdate);
              }

              f.flightno = match.Groups["FlightNo"].Value.Trim();
              f.seat = match.Groups["Cabin"].Value.Trim().Substring(0, 1);
              f.carrier = f.flightno.Substring(0, 2);
              f.dept = FromTo.Substring(0, 3);
              f.depttime = GetDatetime(fdate, match.Groups["FromDt"].Value.Trim());
              f.arr = FromTo.Substring(3);
              f.arrtime = GetDatetime(fdate, match.Groups["ToDt"].Value.Trim());
              f.sailtype = seq;
              // Segments departing two or more days after the first one get trip type 2.
              f.triptype = (f.depttime.Value - dtfdate).TotalDays >= 2 ? 2 : 1;
              f.optcarrier = f.carrier;
              f.optflightno = f.flightno;
              f.state = "Y";
              result.flights.Add(f);
              seq++;
          }
      }
      catch (Exception)
      {
          msg = "" + seq + "段航班信息有误,请核实RT编码文本信息";
          restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
          handle.Error(strreq, strrsp, msg, restime);
          return null;
      }

      // ---- passengers ----
      result.passes = new List<PassengerModel>();
      seq = 1;
      DateTime dtnow = DateTime.Now;
      try
      {
          // Longer titles come first: regex alternation takes the first alternative that
          // matches, so "MR|MS|MRS|MISS" captured "MRS" as "MR" and classified the passenger as male.
          string str_pass = "[.]?(?<name>[A-Z]{2,}/[A-Z]{1,}\\s?[A-Z]{0,})\\s*(?<sex>MRS|MR|MISS|MS)\\s*(" + pnr + ")?";
          rex = new Regex(str_pass, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant);
          MatchCollection matches = rex.Matches(pnrtxt);
          if (matches.Count == 0)
          {
              msg = "未匹配到任何乘客信息";
              restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
              handle.Error(strreq, strrsp, "未匹配到任何乘客信息", restime);
              return result;
          }

          // One generator for all placeholder values instead of two new instances per passenger.
          Random rnd = new Random();
          foreach (Match match in matches)
          {
              string name = match.Groups["name"].Value.Trim();
              string sex = match.Groups["sex"].Value.Trim();
              if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(sex))
              {
                  msg = "" + seq + "位乘客信息有误,请核实RT编码文本信息";
                  restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
                  handle.Error(strreq, strrsp, msg, restime);
                  return null;
              }

              PassengerModel p = new PassengerModel();
              p.name = name;
              // The regex is case-insensitive, so the title comparison must be as well.
              p.sex = string.Equals(sex, "MR", StringComparison.OrdinalIgnoreCase) ? "M" : "F";
              p.phone = "";
              p.nationality = "CN";
              // Placeholder document data; overwritten below when a matching SSR DOCS line exists.
              p.birthday = dtnow.AddYears(rnd.Next(-20, -13));
              p.cardaddress = "CN";
              p.cardexpire = dtnow.AddYears(rnd.Next(2, 10));
              p.cardno = "P88888";
              p.cardtype = "PP";
              p.ptype = 1;
              result.passes.Add(p);
              if (seq == 1)
              {
                  result.needpassinfo = "1";
              }
              seq++;
          }

          // Example line: SSR DOCS NH HK1 P/CN/E81525458/CN/07MAY99/F/19JUL26/ZHANG/MUHAN/P3
          string str_pinfo = "(.SSR DOCS [0-9A-Z]{2} [0-9A-Z]{2}1 P/)(?<cardaddress>[A-Z]{2})/(?<cardno>[0-9A-Z]{2,})/(?<nationality>[A-Z]{2})/(?<birthday>[0-9A-Z]{7})/(?<sex>[A-Z]{1})/(?<cardexpire>[0-9A-Z]{7})/(?<name>[A-Z]{2,}(/)[A-Z]{1,}\\s?[A-Z]{0,})(/H)?/P[0-9]{1,}\\s*";
          rex = new Regex(str_pinfo, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant);
          matches = rex.Matches(pnrtxt);

          // The i-th DOCS line is applied to the i-th passenger, but only when the names agree.
          seq = 0;
          // Two-digit current year. The previous Substring(3) produced a single digit,
          // which was then compared against a two-digit birth year.
          int nowsyear = DateTime.Now.Year % 100;
          foreach (Match match in matches)
          {
              string birthday = match.Groups["birthday"].Value.Trim();     // ddMMMyy
              string cardexpire = match.Groups["cardexpire"].Value.Trim(); // ddMMMyy
              if (result.passes[seq].name == match.Groups["name"].Value.Trim())
              {
                  // Decide the century per passenger; previously the value leaked from one
                  // passenger to the next once it had switched to "20". A two-digit year
                  // not later than the current one is taken to be in the 2000s.
                  string yearpart = int.Parse(birthday.Substring(5)) <= nowsyear ? "20" : "19";

                  result.passes[seq].cardaddress = match.Groups["cardaddress"].Value.Trim();
                  result.passes[seq].cardno = match.Groups["cardno"].Value.Trim();
                  result.passes[seq].nationality = match.Groups["nationality"].Value.Trim();
                  result.passes[seq].sex = match.Groups["sex"].Value.Trim();
                  result.passes[seq].birthday = Convert.ToDateTime(yearpart + birthday.Substring(5) + "-" + MakePnrText.GetMonth(birthday.Substring(2, 3)) + "-" + birthday.Substring(0, 2));
                  result.passes[seq].cardexpire = Convert.ToDateTime("20" + cardexpire.Substring(5) + "-" + MakePnrText.GetMonth(cardexpire.Substring(2, 3)) + "-" + cardexpire.Substring(0, 2));
              }
              seq++;
          }
      }
      catch (Exception)
      {
          msg = "" + seq + "位乘客信息有误,请核实RT编码文本";
          restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
          handle.Error(strreq, strrsp, msg, restime);
          return null;
      }

      result.extemmsg = pnr;
      restime = Convert.ToInt32((DateTime.Now - _now).TotalMilliseconds);
      handle.Succes(strreq, strrsp, restime);
      return result;
  }
View Code

 

2,HtmlAgilityPack 之 HtmlNode类 (主要是XPath语法解析,firebug插件可以查看对应XPath)

// Load an HTML string into HtmlAgilityPack and select table rows via XPath.
// The literal below is a placeholder standing in for the page's HTML source.
string detailContext="html 源码";
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(detailContext);
            HtmlNode node = doc.DocumentNode;
            // XPath: every <tr class='line' height='40'> located anywhere beneath a <table class='tab_result'>.
            // NOTE(review): HtmlAgilityPack's SelectNodes is documented to return null (not an empty
            // collection) when nothing matches -- confirm for the version in use and null-check before iterating.
            HtmlNodeCollection trlist = node.SelectNodes("//table[@class='tab_result']//tr[@class='line'][@height='40']");

 

3,Newtonsoft.Json序列化和反序列化

这里下载:http://www.newtonsoft.com/products/json/ 
 
1  List<Models.实体类> list=Newtonsoft.Json.JsonConvert.DeserializeObject<List<Models.实体类>>(context);
View Code

 

 // Read a JSON response, then download every image listed under "imgs",
 // reporting progress and results to the UI controls.
 using (StreamReader reader = new StreamReader(stream))
 {
     string jsonData = reader.ReadToEnd();

     // Parse the JSON. A root that is not an object, or a missing "imgs" array,
     // leaves nothing to download instead of throwing a NullReferenceException.
     JObject objectRoot = JsonConvert.DeserializeObject(jsonData) as JObject;
     JArray imgsArray = objectRoot == null ? null : objectRoot["imgs"] as JArray;
     int imgCount = imgsArray == null ? 0 : imgsArray.Count;

     for (int i = 0; i < imgCount; i++)
     {
         JObject img = imgsArray[i] as JObject;
         if (img == null)
         {
             continue; // skip entries that are not JSON objects
         }

         string objUrl = (string)img["objURL"];
         //txtLogs.AppendText(objUrl + Environment.NewLine); // debug: show the image URL

         // Snapshot the index. BeginInvoke runs the delegate later on the UI thread, and a
         // `for` loop variable is shared by all iterations, so capturing `i` directly
         // could compute the progress from a later index.
         int current = i;
         try
         {
             // Download this one image.
             DownloadImage(objUrl);

             // Update the progress bar.
             progressBar.BeginInvoke(new Action(() =>
             {
                 progressBar.Value = current * 100 / sumCount;
             }));

             // Append to the log text box.
             txtLogs.BeginInvoke(new Action(() =>
             {
                 txtLogs.AppendText("已下载:" + objUrl + Environment.NewLine);
             }));
         }
         catch (Exception ex)
         {
             // Marshal the error message to the txtLogs control via BeginInvoke.
             txtLogs.BeginInvoke(new Action(() =>
             {
                 txtLogs.AppendText("【异常:" + ex.Message + "" + Environment.NewLine);
             }));
         }
     }
 }
View Code

 

// Pull the first "<digits>件" occurrence out of a baggage-allowance description;
// group 1 holds the digits directly in front of "件".
string baggageText = "成都-昆明,1件,每件23KG,长宽高100*60*40CM;昆明-万象,1件,每件23KG,长宽高100*60*40CM.万象-昆明,1件,每件23KG,长宽高100*60*40CM;昆明-成都,1件,每件23KG,长宽高100*60*40CM";
Regex pieceRegex = new Regex(@"([\d]{1,})件");

// A single Match() call reports success and carries the captured groups.
Match firstHit = pieceRegex.Match(baggageText);
if (firstHit.Success)
{
    // NOTE(review): the second argument of StrToInt is presumably the fallback value -- confirm against StringHelper.
    int adtpc = StringHelper.StrToInt(firstHit.Groups[1].Value, 0);
}
View Code

 

有时候解析获取到的 JSON 数据时,需要手动编写对应的实体类。之前一直手写,感觉太浪费时间,后来找到了一款工具,可以实现转换功能。