将网页上word、pdf、txt文件下载下来,解析成文本内容
1.第一步解析网页
首先创建一个HttpWebRequest:
创建HttpWebRequest
/// <summary>
/// Creates an <see cref="HttpWebRequest"/> for the given URL, preconfigured with a
/// browser-like User-Agent, wildcard Accept and Content-Type headers, a fixed Referer
/// and a fresh cookie container. The request is only built here; it is not sent.
/// </summary>
/// <param name="url">Absolute http/https URL to request.</param>
/// <param name="cookies">Cookies carrying an authenticated session, or <c>null</c> for none.</param>
/// <returns>The configured, not-yet-sent request.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="url"/> does not resolve to an HTTP(S) request (for example a file:// or ftp:// URL).
/// </exception>
public static HttpWebRequest CreateHttpWebRequest(string url, CookieCollection cookies)
{
    // WebRequest.Create returns a different request type for non-HTTP schemes, in which
    // case the "as" cast yields null. Fail with a clear message instead of letting the
    // first property assignment below throw a NullReferenceException.
    HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
    if (webRequest == null)
    {
        throw new ArgumentException("Only http/https URLs are supported: " + url, "url");
    }

    webRequest.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10";
    webRequest.Accept = "*/*";
    // NOTE(review): a Content-Type header has no meaning on a body-less request; kept
    // only so the headers sent on the wire stay exactly as before.
    webRequest.ContentType = "*/*";
    webRequest.Referer = "http://www.java.com.cn";

    // Always attach a container so cookies set by the server are captured,
    // even when the caller supplies none.
    webRequest.CookieContainer = new CookieContainer();
    if (cookies != null)
    {
        webRequest.CookieContainer.Add(cookies);
    }

    return webRequest;
}
再调用GetResponse()得到它的HttpWebResponse:
// Sends the request and obtains the server's response as an HttpWebResponse.
// NOTE(review): `req` is presumably the request returned by CreateHttpWebRequest — confirm.
// The response holds a network connection, so close it once it has been consumed.
HttpWebResponse res = (HttpWebResponse)req.GetResponse();
2.判断网址内容是不是附件的形式:HttpWebResponse有个ContentType属性,如果它的值是application/x-msdownload,则说明网址内容是附件的形式。然后根据响应的Headers就可以得到附件的格式(word、pdf或txt)。
网页内容格式
// Treat the response as a downloadable attachment when its media type is
// application/x-msdownload. A prefix match is used because servers may append
// parameters such as "; charset=...".
if (res.ContentType != null
    && res.ContentType.StartsWith("application/x-msdownload", StringComparison.OrdinalIgnoreCase))
{
    // Look the header up by name rather than by position: header order is not guaranteed.
    string header = res.Headers["Content-Disposition"]; //header
    string type = string.Empty; // attachment format (file extension without the dot)
    if (!string.IsNullOrEmpty(header))
    {
        // Assumes the filename is the last parameter, e.g. attachment; filename="report.doc".
        // Strip trailing quotes/semicolons, then take whatever follows the last dot so that
        // extensions of any length work and short values cannot throw.
        string trimmed = header.Trim().TrimEnd('"', '\'', ';', ' ');
        int dot = trimmed.LastIndexOf('.');
        if (dot >= 0 && dot < trimmed.Length - 1)
        {
            type = trimmed.Substring(dot + 1).ToLowerInvariant();
        }
    }
}
3.然后下载附件
下载文件
/// <summary>
/// Downloads the resource at <paramref name="strHref"/> with an HTTP GET and writes it
/// to <paramref name="filePath"/>.
/// </summary>
/// <param name="strHref">URL of the file to download.</param>
/// <param name="filePath">Destination path; an existing file is overwritten.</param>
/// <returns>
/// <c>true</c> when the content was downloaded and written; <c>false</c> on any failure
/// (invalid URL, network error, timeout, file-system error).
/// </returns>
public bool DownLoadFile(string strHref, string filePath)
{
    HttpWebRequest hreq = null;
    try
    {
        hreq = (HttpWebRequest)WebRequest.Create(strHref);

        // 10-second timeout for obtaining the response.
        hreq.Timeout = 10 * 1000;
        hreq.Method = "GET";

        byte[] content;

        // The using blocks release the response (and its connection) and both streams
        // even when reading fails part-way through.
        using (HttpWebResponse hres = (HttpWebResponse)hreq.GetResponse())
        using (Stream rs = hres.GetResponseStream())
        using (MemoryStream memoryStream = new MemoryStream())
        {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
            {
                memoryStream.Write(buffer, 0, read);
            }

            content = memoryStream.ToArray();
        }

        // The whole body is buffered first so that a failed download never leaves
        // a truncated file at filePath.
        File.WriteAllBytes(filePath, content);
        return true;
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: every failure is reported as false.
        return false;
    }
    finally
    {
        if (hreq != null)
        {
            hreq.Abort();
        }
    }
}
4.解析下载下来的附件
解析word:首先电脑上必须安装Office Word,然后在项目的引用中添加Microsoft.Office.Interop.Word。
解析word文件转换成txt文本
/// <summary>
/// Extracts the full text of a Word document by automating Microsoft Word through
/// the Microsoft.Office.Interop.Word interop assembly.
/// </summary>
/// <param name="fileName">Path of the Word document to read.</param>
/// <returns>The document's text, or an empty string if it could not be read.</returns>
private string GetTextFWord(string fileName)
{
    object nullobj = System.Reflection.Missing.Value;
    Microsoft.Office.Interop.Word.ApplicationClass wordApp = null;
    Microsoft.Office.Interop.Word.Document doc = null;
    try
    {
        wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        object file = fileName;
        doc = wordApp.Documents.Open(
            ref file, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj, ref nullobj);

        // Content.Text returns the text of the whole document body.
        return doc.Content.Text;
    }
    catch
    {
        // Best-effort contract: any failure (Word unavailable, unreadable file) yields an empty string.
        return string.Empty;
    }
    finally
    {
        // Always close the document and quit Word; otherwise every call leaves a
        // WINWORD.EXE process running. Cleanup failures are ignored on purpose so
        // they cannot replace the result computed above.
        if (doc != null)
        {
            try
            {
                doc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
            catch
            {
            }

            System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
        }

        if (wordApp != null)
        {
            try
            {
                // Cast to _Application so the Quit method is not confused with the Quit event.
                ((Microsoft.Office.Interop.Word._Application)wordApp).Quit(ref nullobj, ref nullobj, ref nullobj);
            }
            catch
            {
            }

            System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
        }
    }
}
解析pdf:用到的是第三方库PDFBox-0.7.3.dll,大家可以去网上搜索下载。
pdf文件转换成文本内容
/// <summary>
/// Extracts the text of a PDF file using PDFBox (PDDocument / PDFTextStripper).
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The extracted text, or an empty string if the file could not be parsed.</returns>
private string GetTextFPDF(string fileName)
{
    PDDocument doc = null;
    try
    {
        doc = PDDocument.load(fileName);
        PDFTextStripper pdfStripper = new PDFTextStripper();
        // Use the platform's line separator in the extracted text.
        pdfStripper.setLineSeparator(Environment.NewLine);
        return pdfStripper.getText(doc);
    }
    catch
    {
        // Best-effort contract: any load/parse failure yields an empty string.
        return string.Empty;
    }
    finally
    {
        // Close the document even when extraction throws, so the file is not left open.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch
            {
                // Ignored on purpose: a failed close must not replace the result above.
            }
        }
    }
}
解析txt
txt解析成文本内容