代码改变世界

C#读取HTML文件内容写入记事本

2011-09-29 10:41  coderidea  阅读(5856)  评论(0)  编辑  收藏  举报

C#读取HTML文件内容写入记事本


// Exports the text of every .htm file in the folder typed into textBox1.
// For each page it takes the text of the first <h4> heading as the alarm cause,
// strips the <title>, the first <ul><li>...</ul> block and all remaining markup
// from the page, and appends both pieces to "告警经验库信息.txt" in that folder.
// The input is a folder path such as E:\chfuMetarnet\BSC6810 alarm\ (the
// trailing backslash is optional). Any failure is reported in a message box.
try
{
    int totalFile = 0;
    string dirPath = this.textBox1.Text.Trim();
    if (dirPath == "")
    {
        MessageBox.Show("请输入HTML文件路径!");
    }
    else
    {
        // File names are appended to dirPath below, so it must end with a separator.
        if (!dirPath.EndsWith("\\", StringComparison.Ordinal))
        {
            dirPath = dirPath + "\\";
        }

        DirectoryInfo dirInfo = new DirectoryInfo(dirPath);
        FileInfo[] files = dirInfo.GetFiles();
        string filename = dirPath + "告警经验库信息.txt";

        // Loop-invariant patterns, built once.
        // Note: '|' inside a character class is a literal pipe, so the class
        // lists only the three characters that must go (CR, LF, TAB).
        const string lineBreaksAndTabs = "[\n\r\t]";
        const string headingPattern = "<h4>([^<]*)</h4>";       // text between <h4> and </h4>
        const string titlePattern = "<title>([^<]*)</title>";   // the whole <title> element
        Regex tagRegex = new Regex(@"<[^>]+>|</[^>]+>");        // any HTML tag
        // Everything from "<ul><li>" up to the first "</ul>" (lazy match).
        Regex listRegex = new Regex("<ul><li>(?<key>.*?)</ul>", RegexOptions.None);

        // File.AppendText creates the file when it does not exist yet; the using
        // block guarantees the writer is closed even if a page fails to process.
        using (StreamWriter sw = File.AppendText(filename))
        {
            foreach (FileInfo fileinfo in files)
            {
                // Only .htm pages are processed; compare case-insensitively so
                // ".HTM" is not skipped.
                if (!string.Equals(fileinfo.Extension, ".htm", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                totalFile = totalFile + 1;

                // The pages are decoded as GB2312. Reading the file directly avoids
                // the undisposed WebRequest/WebResponse/Stream chain.
                Encoding encode = System.Text.Encoding.GetEncoding("gb2312");
                string strhtml = File.ReadAllText(fileinfo.FullName, encode);

                // Alarm cause: inner text of the first <h4>. Using the capture group
                // yields an empty string when the page has no <h4>, instead of
                // throwing from Substring on an empty match.
                Match headingMatch = Regex.Match(strhtml, headingPattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                string cause = Regex.Replace(headingMatch.Groups[1].Value, lineBreaksAndTabs, " ");
                // Trim so that a heading ending in whitespace does not leave an
                // empty last token after the split below.
                cause = cause.Trim();

                // The <title> element, normalised the same way as the page text
                // below so that it can be located and removed from it.
                Match titleMatch = Regex.Match(strhtml, titlePattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                string titlename = Regex.Replace(titleMatch.Value, lineBreaksAndTabs, "");
                titlename = titlename.Trim();

                // Collapse the page onto one line before matching the list block.
                strhtml = Regex.Replace(strhtml, lineBreaksAndTabs, "");

                // Drop the complete <ul><li>...</ul> block, if the page has one.
                Match listMatch = listRegex.Match(strhtml);
                if (listMatch.Success)
                {
                    strhtml = strhtml.Replace(listMatch.Value, "");
                }

                // string.Replace throws on an empty search string, so only remove
                // the title when one was actually found.
                if (titlename.Length > 0)
                {
                    strhtml = strhtml.Replace(titlename, "");
                }

                strhtml = tagRegex.Replace(strhtml, " ");   // strip the remaining HTML tags
                strhtml = strhtml.Replace("&nbsp;", "");    // drop non-breaking-space entities

                // The heading looks like "ALM-1 网元启动"; the last space-separated
                // token is the cause text itself.
                string[] arr = cause.Split(' ');
                string zhCause = arr[arr.Length - 1];
                zhCause = this.chinaString(zhCause);

                sw.WriteLine("" + totalFile + "个文件:" + fileinfo.Name);
                sw.WriteLine("-----告警原因------:");
                sw.WriteLine(zhCause);
                sw.WriteLine("-----处理经验------:");
                sw.WriteLine(strhtml);
                sw.WriteLine();
                // Flush per file so finished pages are on disk if a later one fails.
                sw.Flush();
            }
        }

        MessageBox.Show("操作成功!", "提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}
catch (Exception ee)
{
    MessageBox.Show("操作失败:" + ee.Message);
}