删除HTML标记

#ifndef REQUIRE_H
#define REQUIRE_H
#include <cstdio>
#include <cstdlib>
#include <fstream>


/// Terminates the program when a runtime requirement does not hold.
///
/// @param requirement condition that must be true for execution to continue.
/// @param msg         text written to stderr, followed by a newline, when
///                    the requirement is false.
///
/// When `requirement` is true the call has no effect; otherwise the process
/// ends through exit(EXIT_FAILURE).
inline void require(bool requirement,const char* msg="Requirement failed")
{
    // Guard clause: nothing to report when the requirement is satisfied.
    if(requirement)
        return;

    std::fputs(msg,stderr);
    std::fputs("\n",stderr);
    std::exit(EXIT_FAILURE);
}


/// Terminates the program unless exactly `args` command-line arguments were
/// supplied.
///
/// @param argc argument count to check; it must equal `args + 1` (the extra
///             slot presumably accounts for the program name in argv[0]).
/// @param args required number of arguments.
/// @param msg  printf-style format written to stderr on failure; `args` is
///             passed as its single int argument.
///
/// On a mismatch the message and a newline are written to stderr and the
/// process ends through exit(EXIT_FAILURE).
inline void requireArgs(int argc,int args,const char* msg="Must use %d arguments")
{
    const bool countMatches=(argc==args+1);
    if(countMatches)
        return;

    std::fprintf(stderr,msg,args);
    std::fputs("\n",stderr);
    std::exit(EXIT_FAILURE);
}


/// Terminates the program unless at least `minArgs` command-line arguments
/// were supplied.
///
/// @param argc    argument count to check; it must be at least
///                `minArgs + 1` (the extra slot presumably accounts for the
///                program name in argv[0]).
/// @param minArgs minimum number of arguments required.
/// @param msg     printf-style format written to stderr on failure;
///                `minArgs` is passed as its single int argument.
///
/// On failure the message and a newline are written to stderr and the
/// process ends through exit(EXIT_FAILURE).
inline void requireMinArgs(int argc,int minArgs,const char* msg="Must use at least %d arguments")
{
    if(argc<minArgs+1)
    {
        std::fprintf(stderr,msg,minArgs);
        std::fputs("\n",stderr);
        std::exit(EXIT_FAILURE);
    }
}
// Three overloaded assure() functions that check whether an ifstream, ofstream, or fstream is in a usable state (e.g. the file was opened successfully).
/// Terminates the program when an input file stream is in a failed state.
///
/// @param in       stream to check, typically right after it was opened.
/// @param filename name reported in the error message; may be empty.
///
/// When `!in` is true, "Could not open file <filename>" is written to stderr
/// and the process ends through exit(EXIT_FAILURE). Otherwise the call has
/// no effect.
inline void assure(std::ifstream& in,const char* filename="")
{
    if(!in)
    {
        // A space separates the fixed text from the file name so the two do
        // not run together in the diagnostic.
        std::fprintf(stderr,"Could not open file %s\n",filename);
        std::exit(EXIT_FAILURE);
    }
}


/// Terminates the program when an output file stream is in a failed state.
///
/// @param in       stream to check, typically right after it was opened
///                 (the parameter keeps the name used by the sibling
///                 overloads even though this is an output stream).
/// @param filename name reported in the error message; may be empty.
///
/// When `!in` is true, "Could not open file <filename>" is written to stderr
/// and the process ends through exit(EXIT_FAILURE). Otherwise the call has
/// no effect.
inline void assure(std::ofstream& in,const char* filename="")
{
    if(!in)
    {
        // A space separates the fixed text from the file name so the two do
        // not run together in the diagnostic.
        std::fprintf(stderr,"Could not open file %s\n",filename);
        std::exit(EXIT_FAILURE);
    }
}


/// Terminates the program when a bidirectional file stream is in a failed
/// state.
///
/// @param in       stream to check, typically right after it was opened.
/// @param filename name reported in the error message; may be empty.
///
/// When `!in` is true, "Could not open file <filename>" is written to stderr
/// and the process ends through exit(EXIT_FAILURE). Otherwise the call has
/// no effect.
inline void assure(std::fstream& in,const char* filename="")
{
    if(!in)
    {
        // A space separates the fixed text from the file name so the two do
        // not run together in the diagnostic.
        std::fprintf(stderr,"Could not open file %s\n",filename);
        std::exit(EXIT_FAILURE);
    }
}

#endif

#ifndef REPLACEALL_H
#define REPLACEALL_H
#include <string>
using std::string;
/// Replaces every occurrence of `from` in `context` with `to`, in place.
///
/// @param context string that is modified.
/// @param from    substring to search for; when empty, `context` is left
///                untouched.
/// @param to      replacement text; may be empty to delete occurrences.
/// @return reference to `context`, to allow chaining.
///
/// The search resumes just past each inserted replacement, so text produced
/// by a replacement is never matched again (e.g. replacing "a" with "aa"
/// terminates).
///
/// Declared `inline` because the definition lives in a header: without it,
/// including the header from more than one translation unit produces
/// duplicate-definition link errors.
inline std::string& replaceAll(std::string& context,const std::string& from,const std::string& to)
{
    // An empty pattern is found at every position, so the loop below would
    // never terminate; treat it as "nothing to replace".
    if(from.empty())
        return context;

    std::string::size_type lookHere=0;
    std::string::size_type foundHere;
    while((foundHere=context.find(from,lookHere))!=std::string::npos)
    {
        context.replace(foundHere,from.size(),to);
        // Continue after the inserted text so it is not scanned again.
        lookHere=foundHere+to.size();
    }
    return context;
}
#endif

#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include "ReplaceAll.h"
#include "require.h"
using namespace std;


// Replaces, in a single left-to-right pass, every character reference
// "&<name>" in `text` with `replacement`. A ';' directly following the name
// is consumed as part of the reference; the bare form without ';' is still
// accepted.
static void decodeEntity(std::string& text,const std::string& name,const std::string& replacement)
{
    const std::string reference="&"+name;
    std::size_t pos=0;
    while((pos=text.find(reference,pos))!=std::string::npos)
    {
        std::size_t length=reference.size();
        if(pos+length<text.size() && text[pos+length]==';')
            ++length;
        text.replace(pos,length,replacement);
        // Skip the inserted text so it is never decoded a second time.
        pos+=replacement.size();
    }
}

/// Removes HTML tags from `s` in place and decodes a few character
/// references.
///
/// Every span from a '<' to the next '>' (inclusive) is erased. Afterwards
/// the references lt, gt, nbsp and amp -- with or without the trailing ';'
/// -- are replaced by '<', '>', ' ' and '&'. Because decoding happens after
/// tag removal, a decoded '<' or '>' is never treated as a tag delimiter.
///
/// @param s text to clean; modified in place.
/// @return reference to `s`.
/// @throws std::runtime_error if a '<' has no matching '>' after it; the
///         message contains the position of that '<' in the partially
///         stripped string.
///
/// The former dynamic exception specification `throw(runtime_error)` was
/// dropped: it was deprecated in C++11 and is ill-formed from C++17 on.
std::string& stripHTMLTags(std::string& s)
{
    std::size_t leftPos=0;
    // Everything before leftPos is already free of '<', so the search can
    // resume there instead of rescanning from the start.
    while((leftPos=s.find('<',leftPos))!=std::string::npos)
    {
        const std::size_t rightPos=s.find('>',leftPos+1);
        if(rightPos==std::string::npos)
        {
            std::ostringstream msg;
            msg<<"Incomplete HTML tag starting in position "
               <<leftPos;
            throw std::runtime_error(msg.str());
        }
        s.erase(leftPos,rightPos-leftPos+1);
    }
    // Decode the special HTML characters. "amp" must come last: decoding it
    // earlier would turn e.g. "&amp;nbsp;" into "&nbsp;" and then into a
    // space, decoding the text twice.
    decodeEntity(s,"lt","<");
    decodeEntity(s,"gt",">");
    decodeEntity(s,"nbsp"," ");
    decodeEntity(s,"amp","&");
    //Etc...
    return s;
}
/// Entry point: strips the HTML tags from the file named on the command line
/// and prints the result to standard output.
///
/// Usage (from the directory containing the executable):
///     <program> test1.html
///
/// @return EXIT_SUCCESS when the stripped text was printed, EXIT_FAILURE when
///         stripHTMLTags reported a malformed tag. A wrong argument count or
///         an input stream that fails the assure() check ends the process
///         inside requireArgs()/assure().
int main(int argc,char* argv[])
{
    requireArgs(argc,1,"usage: HTMLStripper2 InputFile");
    std::ifstream in(argv[1]);
    assure(in,argv[1]);
    // Read the whole file into one string, then strip it.
    std::ostringstream ss;
    ss<<in.rdbuf();
    try
    {
        std::string s=ss.str();
        std::cout<<stripHTMLTags(s)<<std::endl;
        return EXIT_SUCCESS;
    }
    catch(const std::runtime_error& x)
    {
        // Diagnostics go to stderr, matching the require.h helpers, so they
        // do not mix with the stripped text on stdout.
        std::cerr<<x.what()<<std::endl;
        return EXIT_FAILURE;
    }
    // The former trailing `system("pause"); return 0;` was unreachable (both
    // paths above return) and has been removed.
}


posted @ 2012-11-16 08:58  yuanmus  阅读(149)  评论(0编辑  收藏  举报