需求:客户的数据同时存在于另外一个不可控的系统中,需要和当前系统同步。
思路:自动登录另外一个系统,然后抓取数据,同步到本系统中。
技术点:模拟用户登录;保存登录状态;抓取数据
程序非常简单:
/// <summary>
/// Sends an HTTP POST with a URL-encoded form body to the target URL and
/// returns the response body as text.
/// </summary>
/// <param name="targetURL">Absolute URL to post to.</param>
/// <param name="cc">
/// Cookie container shared across calls; it carries the session/login cookies
/// into this request and receives the cookies set by the response.
/// </param>
/// <param name="param">
/// Form fields to post (field name -> value). Names and values are
/// percent-encoded here, so pass them unencoded. A null value is sent as an
/// empty string.
/// </param>
/// <returns>The response body, decoded with the system default encoding.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="cc"/> or <paramref name="param"/> is null.
/// </exception>
public static string PostAndGetHTML(string targetURL, CookieContainer cc, Hashtable param)
{
    if (cc == null)
    {
        throw new ArgumentNullException("cc");
    }
    if (param == null)
    {
        throw new ArgumentNullException("param");
    }

    // Build "k1=v1&k2=v2". Keys and values are escaped so that characters such
    // as '&', '=' or non-ASCII text cannot corrupt the form body.
    StringBuilder formData = new StringBuilder();
    foreach (DictionaryEntry de in param)
    {
        if (formData.Length > 0)
        {
            formData.Append('&');
        }
        string fieldValue = de.Value == null ? "" : de.Value.ToString();
        formData.Append(Uri.EscapeDataString(de.Key.ToString()));
        formData.Append('=');
        formData.Append(Uri.EscapeDataString(fieldValue));
    }

    // After percent-encoding the body is pure ASCII, so this conversion is lossless.
    byte[] data = Encoding.ASCII.GetBytes(formData.ToString());

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(targetURL);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.1124)";
    // The cookie container is configured before the request stream is opened so
    // the stored cookies are part of the request that carries the form body.
    request.CookieContainer = cc;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        // Keep any cookies the server just issued for the following requests.
        cc.Add(response.Cookies);

        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}
这是一个调用的例子:先登录,再查询。实际中这个逻辑可能有很多步骤。
/// <summary>
/// Sample flow: posts the login form, then requests the query page using the
/// same cookie container so the second request runs inside the logged-in session.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    // One container shared by both requests keeps the session and cookies.
    CookieContainer cc = new CookieContainer();
    // Form fields to post; reused for every step of the flow.
    Hashtable param = new Hashtable();

    string urlLogin = "http://demo.server//login.asp";
    // The keys must match the input names used by the login form
    // (look them up in the source of the login page).
    param.Add("User", "xxx");
    param.Add("Password", "xxxx");
    string result = PostAndGetHTML(urlLogin, cc, param);
    // TODO: inspect result to verify that the login succeeded.

    // After a successful login, go to the target page and post its inputs.
    string url2 = "http://demo.server/query.asp?id=1"; // placeholder, depends on the target site
    param.Clear();
    // Add the query form fields here, e.g. param.Add("SearchAreaId", "JobId");
    result = PostAndGetHTML(url2, cc, param);
    // TODO: pass result to ConvertToDT or process it otherwise.
}
这是一个简单的抓取网页数据的函数(针对 Table 内的数据,直接转化成 DataTable):
/// <summary>
/// Fills <paramref name="dt"/> with the cell texts of an HTML table.
/// </summary>
/// <remarks>
/// Scanning starts after the second "&lt;tr" (the first row is skipped as the
/// header row); if there is no second "&lt;tr" the scan starts near the
/// beginning of the text. Cells are consumed sequentially, one per column of
/// <paramref name="dt"/>, without looking at row boundaries. For each cell the
/// first non-empty text segment between tags is used. All spaces, tabs and
/// carriage returns are removed from the cell content before extraction, so
/// multi-word text is collapsed.
/// </remarks>
/// <param name="dt">Table whose existing columns define how many cells make up one row.</param>
/// <param name="tableHTML">HTML markup of the table.</param>
/// <returns>The same <paramref name="dt"/> instance with the parsed rows appended.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="dt"/> or <paramref name="tableHTML"/> is null.
/// </exception>
private DataTable ConvertToDT(DataTable dt, string tableHTML)
{
    if (dt == null)
    {
        throw new ArgumentNullException("dt");
    }
    if (tableHTML == null)
    {
        throw new ArgumentNullException("tableHTML");
    }

    // Case-insensitive searches on the original text replace the repeated
    // ToLower() copies of the whole document.
    int lastTD = tableHTML.LastIndexOf("</td>", StringComparison.OrdinalIgnoreCase);
    if (lastTD < 0 || dt.Columns.Count == 0)
    {
        // No cells to read, or nowhere to put them.
        return dt;
    }

    int firstRow = tableHTML.IndexOf("<tr", StringComparison.OrdinalIgnoreCase) + 3; // after "<tr"
    int index = tableHTML.IndexOf("<tr", firstRow, StringComparison.OrdinalIgnoreCase) + 3; // after the second "<tr"

    while (index < lastTD)
    {
        DataRow dr = dt.NewRow();
        int cellsRead = 0;

        for (int i = 0; i < dt.Columns.Count; i++)
        {
            int tdOpen = tableHTML.IndexOf("<td", index, StringComparison.OrdinalIgnoreCase);
            if (tdOpen < 0)
            {
                break; // no more cells
            }
            int startTD = tdOpen + 3; // after "<td"
            int endTD = tableHTML.IndexOf("</td>", startTD, StringComparison.OrdinalIgnoreCase);
            if (endTD < 0)
            {
                break; // unterminated cell
            }

            string tdStr = tableHTML.Substring(startTD, endTD - startTD);
            tdStr = tdStr.Replace(" ", "").Replace("\t", "").Replace("\r", "");

            // tdStr begins with the rest of the opening tag, so after splitting
            // on '<' and '>' the odd positions hold the text between tags.
            string value = "";
            string[] v = tdStr.Split('<', '>');
            for (int j = 1; j < v.Length; j += 2)
            {
                string text = v[j].Trim();
                if (text != "")
                {
                    value = text;
                    break;
                }
            }

            dr[i] = value;
            index = endTD;
            cellsRead++;
        }

        if (cellsRead == 0)
        {
            break; // nothing left to read; avoids adding empty rows forever
        }

        dt.Rows.Add(dr);

        if (cellsRead < dt.Columns.Count)
        {
            break; // ran out of cells in the middle of a row; remaining columns stay unset
        }
    }

    return dt;
}
注:对于有验证码的登录系统无效。(如果该系统把验证码存储在 cookie 中则例外,这种情况容易破解。)