C#自动登录网页浏览页面 抓取数据
// Source: http://www.cnblogs.com/netwom/articles/953430.html

/// <summary>
/// Sends an HTTP POST with the given form fields to <paramref name="targetURL"/>
/// and returns the response body as a string.
/// </summary>
/// <param name="targetURL">Absolute URL to post to.</param>
/// <param name="cc">
/// Cookie container shared between calls so that the session established by a
/// login request is reused by later requests. May be null (no cookies kept).
/// </param>
/// <param name="param">Form field names and values; null is treated as empty.</param>
/// <returns>The response body decoded with <c>System.Text.Encoding.Default</c>.</returns>
public static string PostAndGetHTML(string targetURL, CookieContainer cc, Hashtable param)
{
    // The form data is percent-encoded, so it is pure ASCII at this point.
    byte[] data = System.Text.Encoding.ASCII.GetBytes(BuildFormData(param));

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(targetURL);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.1124)";

    // Attach the cookie container BEFORE opening the request stream. The original
    // assigned it after the body had already been written, which risks the stored
    // cookies not being part of this request.
    request.CookieContainer = cc;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (cc != null)
        {
            // Kept from the original: explicitly record the response cookies.
            cc.Add(response.Cookies);
        }

        // NOTE(review): Encoding.Default depends on the machine/runtime; the page's
        // declared charset may differ - confirm against the target site.
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}

/// <summary>
/// Builds an <c>application/x-www-form-urlencoded</c> body ("k1=v1&amp;k2=v2")
/// from the entries of <paramref name="param"/>, percent-encoding keys and values.
/// </summary>
/// <param name="param">Form fields; null or empty yields an empty string.</param>
/// <returns>The encoded form body without a trailing '&amp;'.</returns>
private static string BuildFormData(Hashtable param)
{
    if (param == null)
    {
        return string.Empty;
    }

    StringBuilder formData = new StringBuilder();
    foreach (DictionaryEntry de in param)
    {
        if (formData.Length > 0)
        {
            formData.Append('&');
        }

        // Convert.ToString(object) maps a null value to "" instead of throwing.
        formData.Append(Uri.EscapeDataString(Convert.ToString(de.Key)));
        formData.Append('=');
        formData.Append(Uri.EscapeDataString(Convert.ToString(de.Value)));
    }

    return formData.ToString();
}

/// <summary>
/// Sample usage: logs in with a POST, then requests a second page using the same
/// cookie container so the logged-in session is reused.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    CookieContainer cc = new CookieContainer(); // keeps the session and cookies across requests
    Hashtable param = new Hashtable();          // holds the data to post

    // NOTE(review): the double slash before "login.asp" looks like a typo - confirm.
    string urlLogin = "http://demo.server//login.asp";

    // Field names must match the element ids of the login form; inspect the
    // source of the login page to find them.
    param.Add("User", "xxx");
    param.Add("Password", "xxxx");
    string result = PostAndGetHTML(urlLogin, cc, param);

    // Check result here to decide whether the login succeeded.
    // If it did, go to the target URL and submit any values it needs.

    // NOTE(review): the literal starts with a space - looks unintended; confirm.
    string url2 = " http://demo.server/query.asp?id=1"; // needs changing: site-specific logic
    param.Clear();
    // Add the query fields here, e.g. param.Add("SearchAreaId", "JobId");
    result = PostAndGetHTML(url2, cc, param);

    // Call ConvertToDT on the returned HTML, or process it some other way.
}

/// <summary>
/// A simple page-scraping helper: reads the cells of an HTML table and appends
/// them to <paramref name="dt"/> as rows. The first &lt;tr&gt; is skipped (treated
/// as the header row).
/// </summary>
/// <param name="dt">Table to append to; its column count decides how many cells form one row.</param>
/// <param name="tableHTML">HTML containing the table markup.</param>
/// <returns>The same <paramref name="dt"/> instance.</returns>
private DataTable ConvertToDT(DataTable dt, string tableHTML)
{
    return AppendRowsFromHtml(dt, tableHTML);
}

/// <summary>
/// Parsing core of <c>ConvertToDT</c>; static because it uses no instance state.
/// Cells are consumed in document order, <c>dt.Columns.Count</c> per row, starting
/// after the second &lt;tr. Row boundaries after that point are not inspected.
/// </summary>
private static DataTable AppendRowsFromHtml(DataTable dt, string tableHTML)
{
    // Zero columns would never consume a cell, so the original looped forever.
    if (string.IsNullOrEmpty(tableHTML) || dt.Columns.Count == 0)
    {
        return dt;
    }

    // Ordinal, case-insensitive search replaces the repeated tableHTML.ToLower()
    // calls (one full copy of the document per lookup, culture-sensitive matching).
    const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;

    int lastTD = tableHTML.LastIndexOf("</td>", IgnoreCase);
    int headerRow = tableHTML.IndexOf("<tr", IgnoreCase);
    if (lastTD < 0 || headerRow < 0)
    {
        return dt;
    }

    // A table with only a header row has no data. The original computed -1 + 3
    // here and then parsed the header as if it were data.
    int firstDataRow = tableHTML.IndexOf("<tr", headerRow + 3, IgnoreCase);
    if (firstDataRow < 0)
    {
        return dt;
    }

    int index = firstDataRow + 3; // just after "<tr"
    while (index < lastTD)
    {
        DataRow dr = dt.NewRow();
        int cellsRead = 0;

        for (int i = 0; i < dt.Columns.Count; i++)
        {
            int tdOpen = tableHTML.IndexOf("<td", index, IgnoreCase);
            if (tdOpen < 0)
            {
                // No more cells. The original turned -1 into position 2 and
                // restarted from the top of the document.
                break;
            }

            int startTD = tdOpen + 3; // just after "<td"
            int endTD = tableHTML.IndexOf("</td>", startTD, IgnoreCase);
            if (endTD < 0)
            {
                break;
            }

            dr[i] = ExtractCellText(tableHTML.Substring(startTD, endTD - startTD));
            index = endTD;
            cellsRead++;
        }

        if (cellsRead == 0)
        {
            // Nothing was consumed, so index cannot advance; stop instead of spinning.
            break;
        }

        dt.Rows.Add(dr);
    }

    return dt;
}

/// <summary>
/// Returns the first non-empty piece of text found between tags inside a cell.
/// </summary>
/// <param name="tdStr">
/// The text between "&lt;td" and "&lt;/td&gt;", i.e. it still starts with the rest
/// of the opening tag (attributes and the closing '&gt;').
/// </param>
/// <returns>The trimmed text, or an empty string when the cell has none.</returns>
private static string ExtractCellText(string tdStr)
{
    // NOTE(review): this removes every space, including those inside the cell
    // text; possibly the original literal was "&nbsp;" - confirm.
    tdStr = tdStr.Replace(" ", "").Replace("\t", "").Replace("\r", "");

    // After splitting on '<' and '>', segment 0 is the remainder of the opening
    // <td ...> tag and, for well-formed markup, odd segments are text between tags.
    string[] v = tdStr.Split('<', '>');

    // Step by two with the bound checked on every access. The original incremented
    // j twice per pass and read v[j] unchecked, which overran the array whenever
    // the segment count was odd.
    for (int j = 1; j < v.Length; j += 2)
    {
        string text = v[j].Trim();
        if (text.Length != 0)
        {
            return text;
        }
    }

    return "";
}
posted on 2011-04-24 18:37 lijinchang 阅读(493) 评论(0) 编辑 收藏 举报