[重点]delphi 实现 根据给定的标题去《中国青年报》网上电子报数据中查找匹配的内容,并从该内容中取出引题、正题、副题、作者和正文。
项目要求:根据给定的标题去《中国青年报》网上电子报数据中查找匹配的内容,并从该内容中取出引题、正题、作者和正文。
unit Unit1; interface uses Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics, Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls, IdBaseComponent, IdComponent, IdTCPConnection, IdTCPClient, IdHTTP, Vcl.ComCtrls; type TForm1 = class(TForm) IdHTTP1: TIdHTTP; Button1: TButton; Label1: TLabel; Edit1: TEdit; ProgressBar1: TProgressBar; Memo1: TMemo; Button2: TButton; Memo2: TMemo; procedure Button1Click(Sender: TObject); procedure FormCreate(Sender: TObject); procedure Button2Click(Sender: TObject); private { Private declarations } public { Public declarations } end; var Form1: TForm1; implementation uses StrUtils,HttpApp; {$R *.dfm} type TDelFlags = set of (dfDelBefore, dfDelAfter); function Delstr(var ms: String; endstr: String; Flags: TDelFlags; bself: Boolean = True): String; var l: Integer; begin l := length(endstr); if dfDelBefore in Flags then begin if bself then begin Result := copy(ms, 1, pos(endstr, ms) + l - 1); Delete(ms, 1, pos(endstr, ms) + l - 1); end else begin Result := copy(ms, 1, pos(endstr, ms) - 1); Delete(ms, 1, pos(endstr, ms) - 1); end; end else begin if bself then begin Result := copy(ms, pos(endstr, ms), length(ms)); Delete(ms, pos(endstr, ms), length(ms)); end else begin Result := copy(ms, pos(endstr, ms) + l, length(ms)); Delete(ms, pos(endstr, ms) + l, length(ms)); end; end; end; procedure DelstrEx(var ms: String; endstr: String; var DelData: String; Flags: TDelFlags; bself: Boolean = True); var l: Integer; begin l := length(endstr); if dfDelBefore in Flags then begin //删除字符串的前半部分 if bself then //连同自己一起删除 begin DelData := copy(ms, 1, pos(endstr, ms) + l - 1); Delete(ms, 1, pos(endstr, ms) + l - 1); end else begin DelData := copy(ms, pos(endstr, ms) - 1, length(ms)); Delete(ms, 1, pos(endstr, ms) - 1); end; end else begin if bself then begin DelData := copy(ms, pos(endstr, ms), length(ms)); Delete(ms, pos(endstr, ms), length(ms)); //连同自己一起删除 end else begin DelData := copy(ms, pos(endstr, ms) + l, length(ms)); Delete(ms, pos(endstr, ms) + l, length(ms)); end; end; end; {DelstrEx} function GetCenterStr(src, str1, str2: String): String; var i, i2, i3: Integer; begin i := 0; i2 := 0; i3 := 0; Delstr(src, str1, [dfDelBefore]); i := pos(AnsiLowercase(str1), AnsiLowercase(src)); i3 := pos(AnsiLowercase(str2), AnsiLowercase(src)); Result := copy(src, i2 + 1, i3 - i2 - 1); end; function delstrByNum(ss:string;uniqueFlag:string;disapperNum:integer;FromFlags: TDelFlags;bReturnDeletedPart:boolean):string; var _num:integer; _Str:string; begin _num:=0; _Str:=ss; result:=''; while _num<disapperNum do begin if dfDelBefore in FromFlags then //从字符串左端开始删除 begin delstr(_Str,uniqueFlag,FromFlags); end else begin //从字符串右端开始删除 _Str:= StrUtils.ReverseString(_Str) ; if bReturnDeletedPart then delstrEx(_Str,StrUtils.ReverseString(uniqueFlag),result,[dfdelbefore]) else delstr(_Str,StrUtils.ReverseString(uniqueFlag),[dfdelbefore]); _Str:= StrUtils.ReverseString(_Str) ; end; inc(_num); end; if result='' then result:=_Str else result:= StrUtils.ReverseString(result) ; end; function Matchstrings(Source, pattern: String): Boolean; var pSource: array[0..255] of Char; pPattern: array[0..255] of Char; function MatchPattern(element, pattern: PChar): Boolean; function IsPatternWild(pattern: PChar): Boolean; begin Result := StrScan(pattern, '*') <> nil; if not Result then Result := StrScan(pattern, '?') <> nil; end; begin if 0 = StrComp(pattern, '*') then Result := True else if (element^ = Chr(0)) and (pattern^ <> Chr(0)) then Result := False else if element^ = Chr(0) then Result := True else begin case pattern^ of '*': if MatchPattern(element, @pattern[1]) then Result := True else Result := MatchPattern(@element[1], pattern); '?': Result := MatchPattern(@element[1], @pattern[1]); else if element^ = pattern^ then Result := MatchPattern(@element[1], @pattern[1]) else Result := False; end; end; end; begin StrPCopy(pSource, Source); StrPCopy(pPattern, pattern); Result := MatchPattern(pSource, pPattern); end; {匹配字符串函数} {从磁盘中搜索指定类型的所有文件} procedure FindFiles(ASourceDir, SearchFileType: String; var List: TStrings); var FileRec: TSearchrec; Sour, OldFileName, NewFileName: String; fs: TFileStream; begin Sour := ASourceDir; if Sour[length(Sour)] <> '\' then Sour := Sour + '\'; if FindFirst(Sour + '*.*', faAnyfile, FileRec) = 0 then {循环} repeat if ((FileRec.Attr and faDirectory) <> 0) then begin if (FileRec.Name <> '.') and (FileRec.Name <> '..') then //找到目录 begin FindFiles(Sour + FileRec.Name, SearchFileType, List); end; end else //找到文件 begin if Matchstrings(LowerCase(FileRec.Name), SearchFileType) then begin List.Add(Sour + FileRec.Name); end; {拷贝所有类型的文件} end; until FindNext(FileRec) <> 0; system.SysUtils.FindClose(FileRec); end; {从磁盘中搜索指定类型的所有文件} procedure RmHtmlTags(var src: string); function DelTag(var src: string): boolean; var iPosS, iPosE: integer; begin result := False; if pos('<script', AnsiLowerCase(src)) > 0 then begin iPosS := pos('<script', AnsiLowerCase(src)); if iPosS > 0 then begin iPosE := pos('</script>', AnsiLowerCase(src)); result := iPosE > iPosS; if result then Delete(src, iPosS, iPosE - iPosS + 9); end; end else begin iPosS := pos('<', src); if iPosS > 0 then begin iPosE := pos('>', src); result := iPosE > iPosS; if result then Delete(src, iPosS, iPosE - iPosS + 1); end; end; end; begin //src := LowerCase(src); src := src; repeat until not DelTag(src); end; procedure RmHtmlTagsEx(var src: string); function DelTag(var src: string): boolean; var iPosS, iPosE: integer; begin result := False; if pos('<script', AnsiLowerCase(src)) > 0 then begin iPosS := pos('<script', AnsiLowerCase(src)); if iPosS > 0 then begin iPosE := pos('</script>', AnsiLowerCase(src)); result := iPosE > iPosS; if result then Delete(src, iPosS, iPosE - iPosS + 9); end; end else if pos('<style', AnsiLowerCase(src)) > 0 then begin iPosS := pos('<style', AnsiLowerCase(src)); if iPosS > 0 then begin iPosE := pos('</style>', AnsiLowerCase(src)); result := iPosE > iPosS; if result then Delete(src, iPosS, iPosE - iPosS + 9); end; end else begin { iPosS := pos('<', src); if iPosS > 0 then begin iPosE := pos('>', src); result := iPosE > iPosS; if result then Delete(src, iPosS, iPosE - iPosS + 1); end; } end; end; begin //src := LowerCase(src); src := src; repeat until not DelTag(src); end; function UrlDecoder(const AUrl:string):string; begin result:= UTF8Decode(HttpDecode(AUrl)); end; function UrlEncoder(const AUrl:string):string; begin //URL编码通常使用“+”来替换空格。 result:=HttpEncode(UTF8Encode(AUrl)); end; function getResURL(http:TIdHttp;searchWord:string):string; var info:tstringlist; res:tstringstream; tURL:string; MemoText: string; begin http.HandleRedirects:=true; http.Request.UserAgent:='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.2)'; http.Request.Host:='search.cyol.com'; http.Request.ContentType:='application/x-www-form-urlencoded'; http.Request.Referer:='http://search.cyol.com/index.htm'; http.request.CacheControl:='no-cache'; http.HTTPOptions:=http.HTTPOptions+[hoKeepOrigProtocol]; try info:=tstringlist.Create; res:=tstringstream.Create('',TEncoding.UTF8); { info.Add('op=new'); info.Add('searchBtn=搜索'); info.Add('searchText='+searchWord); //全站内模糊搜索 // info.Add('searchText=一日为师 终身挨骂?'); } info.Add('ak='); info.Add('ck='); info.Add('df='); info.Add('dt='); info.Add('nk=4'); info.Add('od=date'); info.Add('op=adv'); info.Add('tk='+searchWord); tURL:='http://search.cyol.com/searchh.jsp'; http.Post(tURL,info,res); MemoText:= res.DataString; delstr(MemoText,'resultdiv',[dfdelbefore]); //showmessage(MemoText); if pos('color:red',ansilowercase(MemoText))=0 then begin result:=''; Exit; end; delstr(MemoText,'>',[dfdelbefore]); delstr(MemoText,'<a',[dfdelbefore]); delstr(MemoText,'http:',[dfdelbefore],false); delstr(MemoText,'.htm',[dfdelafter],false); result:=MemoText; finally freeandnil(info); freeandnil(res); //http.Free; end; end; function getHtmlStr(http:TIdHttp;fURL:string):string; begin if assigned(http) and (http is TIdHttp) and (http<>nil) then result:= http.Get(fURL); end; procedure TForm1.Button1Click(Sender: TObject); var htmlText:string; biaoti: string; Author: string; yinti: string; table_Pos: Integer; ss: string; outdata: string; neirong: string; zhenwen: string; frontPart: string; subtitle: string; txtList: TStrings; i: Integer; readtxt: TStringList; zhenti: string; resURL: string; begin button1.Caption:='正在处理'; button1.Enabled:=false; { htmlText:= getHtmlStr(idHTTP1, getResURL(idHTTP1,'一日为师 终身挨骂?') ); frontPart:=htmlText; delstr(frontPart,'<!--enpproperty',[dfdelbefore]); delstr(frontPart,'/enpproperty',[dfdelafter]); Author:= GetCenterStr(frontPart,'<author>','</author>'); //作者 subtitle:= GetCenterStr(frontPart,'<subtitle>','</subtitle>'); //副题 yinti:= GetCenterStr(frontPart,'<introtitle>','</introtitle>'); //引题 //取正文 zhenwen:=htmlText; delstr(zhenwen,'<!--enpcontent-->',[dfdelbefore]); delstr(zhenwen,'<!--/enpcontent-->',[dfdelafter]); Memo1.Text:=zhenwen; } if not directoryExists(edit1.Text) then begin showmessage('请输入标引txt的路径!'); exit; end; txtList:=tstringlist.Create ; readtxt:=TStringlist.Create ; findfiles(edit1.Text,'*.txt',txtList); ProgressBar1.Position:=0; ProgressBar1.Max:=txtlist.Count; try for i := 0 to txtList.Count-1 do begin application.ProcessMessages ; ProgressBar1.Position:=i+1; readtxt.LoadFromFile(txtList[i]); zhenti:=readtxt.Values['<主题>']; htmlText:=''; zhenwen:=''; author:='';subtitle:=''; yinti:=''; resURL:=getResURL(idHTTP1,trim(zhenti)); if ''<>trim(resURL) then begin htmlText:= getHtmlStr(idHTTP1, resURL); frontPart:=htmlText; delstr(frontPart,'<!--enpproperty',[dfdelbefore]); delstr(frontPart,'/enpproperty',[dfdelafter]); Author:= GetCenterStr(frontPart,'<author>','</author>'); //作者 subtitle:= GetCenterStr(frontPart,'<subtitle>','</subtitle>'); //副题 yinti:= GetCenterStr(frontPart,'<introtitle>','</introtitle>'); //引题 //取正文 zhenwen:=htmlText; delstr(zhenwen,'<!--enpcontent-->',[dfdelbefore]); delstr(zhenwen,'<!--/enpcontent-->',[dfdelafter]); RmHtmlTagsEx(zhenwen); if ''<>trim(yinti) then readtxt.Values['<引题>']:=yinti; if ''<>trim(subtitle) then readtxt.Values['<副题>']:=subtitle; if ''<>trim(author) then readtxt.Values['<作者>']:=author; if ''<>trim(zhenwen) then readtxt.Values['<正文>']:=slinebreak+trim(zhenwen); readtxt.SaveToFile(txtList[i]); readtxt.Clear ; end else begin Memo2.Lines.Add('未找到对应数据:'+txtList[i]); end; end; // for i end if ProgressBar1.Max=ProgressBar1.Position then begin showmessage('处理完成!'); end; finally button1.Caption:='开始处理'; button1.Enabled:=true; freeandnil(readtxt); freeandnil(txtlist); end; { delstr(htmlText,'<body',[dfdelbefore]); biaoti:='biaoti'; //取作者 Author:=htmlText; delstr(Author,biaoti,[dfdelbefore]); delstr(Author,'rc-writer',[dfdelbefore]); delstr(Author,'>',[dfdelbefore]); delstr(Author,'<',[dfdelafter]); showmessage(Author); //取引题 yinti:=htmlText; delstr(yinti,biaoti,[dfdelafter]); table_Pos:=0; //example: ss:='<table>ccc</table><table>ddd</table>'; yinti:=delstrByNum(yinti,'<table',1,[dfdelafter],true)+'>'; RmHtmlTags(yinti); showmessage(yinti ); //取正文内容 neirong:='neirong'; zhenwen:=htmlText; delstr(zhenwen,neirong,[dfdelbefore]); delstr(zhenwen,'<P',[dfdelbefore],false); delstr(zhenwen,'<script',[dfdelafter]); Memo1.Text:=zhenwen; } end; procedure TForm1.Button2Click(Sender: TObject); var ss: string; begin ss:=Memo1.Text; RmHtmlTagsEx(ss); memo1.Text:=ss; end; procedure TForm1.FormCreate(Sender: TObject); begin edit1.Clear ; memo2.Clear ; end; end.