using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Collections.Specialized;
using System.Web;

namespace Common.Helpers
{
    /// <summary>
    /// A <see cref="WebClient"/> with cookie support, a configurable timeout and
    /// User-Agent, optional control over automatic redirects, access to the most
    /// recent response object, and form-POST helpers that URL-encode with a
    /// caller-chosen encoding.
    /// </summary>
    public class HttpWebClient : WebClient
    {
        #region Public properties
        /// <summary>
        /// User-Agent string written to the request headers before each request.
        /// The constructors initialise it to <see cref="UserAgentValues.FireFox"/>.
        /// When null or empty, the User-Agent header is left untouched.
        /// </summary>
        public string UserAgent { get; set; }

        /// <summary>
        /// Cookie container assigned to every <see cref="HttpWebRequest"/> created by this client.
        /// </summary>
        public CookieContainer CookieContainer { get; set; }

        /// <summary>
        /// true if POST requests should expect a 100-Continue response; otherwise false.
        /// The constructors initialise it to false.
        /// </summary>
        public bool Expect100Continue { get; set; }

        private WebResponse m_LastWebResponse = null;
        /// <summary>
        /// The response object returned by the most recent request, or null if none has been received yet.
        /// </summary>
        public WebResponse LastWebResponse { get { return this.m_LastWebResponse; } }

        private int m_Timeout = 120000;
        /// <summary>
        /// Request timeout in milliseconds. Defaults to 120000 (120 seconds).
        /// </summary>
        public int Timeout
        {
            get { return m_Timeout; }
            set { m_Timeout = value; }
        }

        private HttpWebClientSetting m_HttpWebClientSetting = null;
        /// <summary>
        /// Additional client settings. This property never returns null: it is created
        /// lazily on first read, and assigning null installs a fresh default instance.
        /// </summary>
        public HttpWebClientSetting HttpWebClientSetting
        {
            get
            {
                if (m_HttpWebClientSetting == null)
                {
                    m_HttpWebClientSetting = new HttpWebClientSetting();
                }
                return m_HttpWebClientSetting;
            }
            set
            {
                m_HttpWebClientSetting = value ?? new HttpWebClientSetting();
            }
        }

        /// <summary>
        /// Optional callback invoked with every <see cref="HttpWebRequest"/> after this class
        /// has configured it, allowing callers to customise the request further. Defaults to null.
        /// </summary>
        public Action<HttpWebRequest> PrepareProcessWebRequest { get; set; }
        #endregion

        #region Constructors
        /// <summary>
        /// Creates a client with a new, empty <see cref="CookieContainer"/>.
        /// </summary>
        public HttpWebClient()
            : this(new CookieContainer())
        {
        }

        /// <summary>
        /// Creates a client that uses the supplied cookie container.
        /// </summary>
        /// <param name="cookieContainer">Cookie container assigned to HTTP requests.</param>
        public HttpWebClient(CookieContainer cookieContainer)
        {
            this.CookieContainer = cookieContainer;
            this.UserAgent = UserAgentValues.FireFox;
            this.Expect100Continue = false;
        }
        #endregion

        #region Overrides: request configuration (cookies, timeout, redirects)
        /// <summary>
        /// Creates the request via the base class and applies this client's User-Agent,
        /// timeout, cookie container, 100-Continue and auto-redirect settings, then
        /// invokes <see cref="PrepareProcessWebRequest"/> if one is set.
        /// </summary>
        /// <param name="address">The URI to request.</param>
        /// <returns>The configured request.</returns>
        protected override WebRequest GetWebRequest(Uri address)
        {
            if (!string.IsNullOrEmpty(this.UserAgent))
            {
                // Assign rather than Add: if a previous call failed before the headers were
                // consumed, Add would append a second value to the one already present.
                this.Headers[HttpRequestHeader.UserAgent] = this.UserAgent;
            }

            WebRequest request = base.GetWebRequest(address);
            request.Timeout = this.Timeout;

            HttpWebRequest httpRequest = request as HttpWebRequest;
            if (httpRequest != null)
            {
                httpRequest.CookieContainer = this.CookieContainer;
                httpRequest.ServicePoint.Expect100Continue = this.Expect100Continue;

                // The HttpWebClientSetting property never returns null, so no null check is needed.
                httpRequest.AllowAutoRedirect = this.HttpWebClientSetting.AllowAutoRedirect;

                // Give the caller-supplied hook the last word on the request.
                Action<HttpWebRequest> prepare = this.PrepareProcessWebRequest;
                if (prepare != null)
                {
                    prepare(httpRequest);
                }
            }

            return request;
        }
        #endregion

        #region Overrides: expose the last response object
        /// <summary>
        /// Obtains the response from the base class and records it in <see cref="LastWebResponse"/>.
        /// </summary>
        /// <param name="request">The request to get the response for.</param>
        /// <returns>The response returned by the base class.</returns>
        protected override WebResponse GetWebResponse(WebRequest request)
        {
            WebResponse response = base.GetWebResponse(request);
            this.m_LastWebResponse = response;
            return response;
        }

        /// <summary>
        /// Asynchronous counterpart of <see cref="GetWebResponse(WebRequest)"/>; also records
        /// the response in <see cref="LastWebResponse"/> so that asynchronous operations
        /// update it as well.
        /// </summary>
        /// <param name="request">The request to get the response for.</param>
        /// <param name="result">The pending asynchronous result.</param>
        /// <returns>The response returned by the base class.</returns>
        protected override WebResponse GetWebResponse(WebRequest request, IAsyncResult result)
        {
            WebResponse response = base.GetWebResponse(request, result);
            this.m_LastWebResponse = response;
            return response;
        }
        #endregion

        #region (public) PostData with NameValueCollection
        /// <summary>
        /// POSTs form data to a URL and returns the response content.
        /// ZhangQingFeng 2014-12-14 Add
        /// EditLog:
        /// ZhangQingFeng 2015-05-12 Edit: per the original author's reading of the WebClient source
        /// (UploadValuesInternal), UploadValues always URL-encodes as UTF-8, so the body is
        /// encoded here with the requested encoding and sent through UploadData instead.
        /// </summary>
        /// <param name="url">The URL to post to.</param>
        /// <param name="data">The form fields to submit; null is treated as an empty form.</param>
        /// <param name="encoding">Encoding used to URL-encode the request; UTF-8 when null.
        /// This client's Encoding property is set to the effective value.</param>
        /// <param name="responseEncoding">Encoding used to decode the response; when null the request encoding is used.</param>
        /// <returns>The decoded response content.</returns>
        public string PostData(string url, NameValueCollection data, Encoding encoding, Encoding responseEncoding)
        {
            // Resolve the fallback once so the URL-encoding below never sees a null encoding.
            Encoding requestEncoding = encoding ?? Encoding.UTF8;
            this.Encoding = requestEncoding;

            // Assign rather than Add so a value left over from a failed call is not duplicated.
            this.Headers[HttpRequestHeader.ContentType] = "application/x-www-form-urlencoded";

            string body = BuildFormBody(data, requestEncoding);

            // The body is already percent-encoded, so it contains ASCII characters only.
            byte[] responseBytes = this.UploadData(url, "POST", Encoding.ASCII.GetBytes(body));

            return (responseEncoding ?? requestEncoding).GetString(responseBytes);
        }

        /// <summary>
        /// POSTs form data to a URL and returns the response content, using one encoding for both request and response.
        /// ZhangQingFeng 2014-12-14 Add
        /// </summary>
        /// <param name="url">The URL to post to.</param>
        /// <param name="data">The form fields to submit.</param>
        /// <param name="encoding">Encoding used for both the request and the response.</param>
        /// <returns>The decoded response content.</returns>
        public string PostData(string url, NameValueCollection data, Encoding encoding)
        {
            return PostData(url, data, encoding, null);
        }

        /// <summary>
        /// POSTs form data to a URL and returns the response content, using this client's
        /// Encoding property for both request and response.
        /// ZhangQingFeng 2014-12-14 Add
        /// </summary>
        /// <param name="url">The URL to post to.</param>
        /// <param name="data">The form fields to submit.</param>
        /// <returns>The decoded response content.</returns>
        public string PostData(string url, NameValueCollection data)
        {
            return PostData(url, data, this.Encoding);
        }

        /// <summary>
        /// Builds an application/x-www-form-urlencoded body ("k1=v1&amp;k2=v2") from the collection,
        /// URL-encoding keys and values with the given encoding. Returns an empty string for a null or empty collection.
        /// </summary>
        private static string BuildFormBody(NameValueCollection data, Encoding encoding)
        {
            StringBuilder body = new StringBuilder();
            if (data == null)
            {
                return body.ToString();
            }

            string delimiter = string.Empty;
            foreach (string name in data.AllKeys)
            {
                body.Append(delimiter);
                body.Append(HttpUtility.UrlEncode(name, encoding));
                body.Append("=");
                body.Append(HttpUtility.UrlEncode(data[name], encoding));
                delimiter = "&";
            }

            return body.ToString();
        }
        #endregion

        #region (public) PostData with Dictionary
        /// <summary>
        /// POSTs form data to a URL and returns the response content.
        /// ZhangQingFeng 2014-12-14 Add
        /// </summary>
        /// <param name="url">The URL to post to.</param>
        /// <param name="data">The form fields to submit; null is treated as an empty form.</param>
        /// <param name="encoding">Encoding used to URL-encode the request.</param>
        /// <param name="responseEncoding">Encoding used to decode the response; when null the request encoding is used.</param>
        /// <returns>The decoded response content.</returns>
        public string PostData(string url, Dictionary<string, string> data, Encoding encoding, Encoding responseEncoding)
        {
            NameValueCollection postData = new NameValueCollection();
            if (data != null)
            {
                foreach (KeyValuePair<string, string> item in data)
                {
                    postData.Add(item.Key, item.Value);
                }
            }
            return PostData(url, postData, encoding, responseEncoding);
        }

        /// <summary>
        /// POSTs form data to a URL and returns the response content, using one encoding for both request and response.
        /// ZhangQingFeng 2014-12-14 Add
        /// </summary>
        /// <param name="url">The URL to post to.</param>
        /// <param name="data">The form fields to submit.</param>
        /// <param name="encoding">Encoding used for both the request and the response.</param>
        /// <returns>The decoded response content.</returns>
        public string PostData(string url, Dictionary<string, string> data, Encoding encoding)
        {
            return PostData(url, data, encoding, null);
        }

        /// <summary>
        /// POSTs form data to a URL and returns the response content, using this client's
        /// Encoding property for both request and response.
        /// ZhangQingFeng 2014-12-14 Add
        /// </summary>
        /// <param name="url">The URL to post to.</param>
        /// <param name="data">The form fields to submit.</param>
        /// <returns>The decoded response content.</returns>
        public string PostData(string url, Dictionary<string, string> data)
        {
            return PostData(url, data, this.Encoding);
        }
        #endregion

        #region Helper types
        /// <summary>
        /// Predefined browser User-Agent strings.
        /// </summary>
        public class UserAgentValues
        {
            public static readonly string FireFox = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0";
            public static readonly string Chrome = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36";
            public static readonly string IE8 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2;)";
        }
        #endregion
    }

    /// <summary>
    /// Settings object for <see cref="HttpWebClient"/>.
    /// </summary>
    public class HttpWebClientSetting
    {
        private bool m_AllowAutoRedirect = true;
        /// <summary>
        /// Whether the client follows redirect responses automatically. When true the response
        /// obtained is that of the redirect target; when false it is the original redirect
        /// response. Defaults to true.
        /// </summary>
        public bool AllowAutoRedirect
        {
            get { return m_AllowAutoRedirect; }
            set { m_AllowAutoRedirect = value; }
        }
    }
}
在做页面抓取的过程中,发现自带的WebClient不够灵活,因此做了一个实现。
关于在PostData方法中不使用UploadValues()方法的原因:
1. 查看微软的源代码实现时发现,无论请求的Encoding设置为何种编码(例如GB2312),WebClient的UploadValues()在上传内容时,内部始终使用UTF-8进行UrlEncode。因此当服务端按GB2312等非UTF-8编码解码时,提交数据中的中文就会乱码;PostData改为按指定的编码自行进行UrlEncode,再通过UploadData()提交,以规避此问题。
关于HttpWebClientSetting中的AllowAutoRedirect属性:
在WebClient发起请求时,若响应为重定向,WebClient默认会自动跟随重定向,因此该类提供此设置项以控制访问时是否自动重定向。自动重定向后的第二次请求中,Referer头会被置空;若将AllowAutoRedirect设置为false,再手动从响应头(Response.Headers)中取出Location地址,设置好Referer头后重新访问,则可更真实地模拟浏览器访问,从而避开一些网站的防抓取设置。
关于HttpWebClient中的LastWebResponse属性:
当存在多次重定向时,该属性记录的是最后一次返回的响应对象;从该对象的ResponseUri属性即可取到最终返回响应的页面真实地址,从而为下一次请求设置Referer头作准备。
大约就是如此,后期如有Bug会继续更新。