游子日月长

笑渐不闻声渐悄,多情却被无情恼!

导航

[重点]delphi 实现 根据给定的标题去《中国青年报》网上电子报数据中查找匹配的内容,并从该内容中取出引题、正题、副题、作者和正文。

项目要求:根据给定的标题去《中国青年报》网上电子报数据中查找匹配的内容,并从该内容中取出引题、正题、作者和正文。


unit Unit1;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
  Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls, IdBaseComponent,
  IdComponent, IdTCPConnection, IdTCPClient, IdHTTP, Vcl.ComCtrls;

type
  TForm1 = class(TForm)
    IdHTTP1: TIdHTTP;
    Button1: TButton;
    Label1: TLabel;
    Edit1: TEdit;
    ProgressBar1: TProgressBar;
    Memo1: TMemo;
    Button2: TButton;
    Memo2: TMemo;
    procedure Button1Click(Sender: TObject);
    procedure FormCreate(Sender: TObject);
    procedure Button2Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation
       uses StrUtils,HttpApp;
{$R *.dfm}

type
  TDelFlags = set of (dfDelBefore, dfDelAfter);


function Delstr(var ms: String; endstr: String; Flags: TDelFlags;
  bself: Boolean = True): String;
var
  l: Integer;
begin
  l := length(endstr);
  if dfDelBefore in Flags then
  begin
    if bself then
    begin
      Result := copy(ms, 1, pos(endstr, ms) + l - 1);
      Delete(ms, 1, pos(endstr, ms) + l - 1);
    end
    else
    begin
      Result := copy(ms, 1, pos(endstr, ms) - 1);
      Delete(ms, 1, pos(endstr, ms) - 1);
    end;
  end
  else
  begin
    if bself then
    begin
      Result := copy(ms, pos(endstr, ms), length(ms));
      Delete(ms, pos(endstr, ms), length(ms));
    end
    else
    begin
      Result := copy(ms, pos(endstr, ms) + l, length(ms));
      Delete(ms, pos(endstr, ms) + l, length(ms));
    end;
  end;
end;

procedure DelstrEx(var ms: String; endstr: String;
  var DelData: String; Flags: TDelFlags; bself: Boolean = True);
var
  l: Integer;
begin
  l := length(endstr);
  if dfDelBefore in Flags then
  begin           //删除字符串的前半部分
    if bself then //连同自己一起删除
    begin
      DelData := copy(ms, 1, pos(endstr, ms) + l - 1);
      Delete(ms, 1, pos(endstr, ms) + l - 1);
    end
    else
    begin
      DelData := copy(ms, pos(endstr, ms) - 1, length(ms));
      Delete(ms, 1, pos(endstr, ms) - 1);
    end;
  end
  else
  begin
    if bself then
    begin
      DelData := copy(ms, pos(endstr, ms), length(ms));
      Delete(ms, pos(endstr, ms), length(ms)); //连同自己一起删除
    end
    else
    begin
      DelData := copy(ms, pos(endstr, ms) + l, length(ms));
      Delete(ms, pos(endstr, ms) + l, length(ms));
    end;
  end;
end; {DelstrEx}


function GetCenterStr(src, str1, str2: String): String;
var
  i, i2, i3: Integer;
begin
  i := 0;
  i2 := 0;
  i3 := 0;
  Delstr(src, str1, [dfDelBefore]);
  i := pos(AnsiLowercase(str1), AnsiLowercase(src));
  i3 := pos(AnsiLowercase(str2), AnsiLowercase(src));
  Result := copy(src, i2 + 1, i3 - i2 - 1);
end;


function delstrByNum(ss:string;uniqueFlag:string;disapperNum:integer;FromFlags: TDelFlags;bReturnDeletedPart:boolean):string;
var _num:integer;
    _Str:string;
begin
     _num:=0;
     _Str:=ss;

     result:='';

     while _num<disapperNum do
     begin
         if dfDelBefore in FromFlags then   //从字符串左端开始删除
         begin
            delstr(_Str,uniqueFlag,FromFlags);
         end
         else
         begin  //从字符串右端开始删除
           _Str:= StrUtils.ReverseString(_Str) ;

           if bReturnDeletedPart then
              delstrEx(_Str,StrUtils.ReverseString(uniqueFlag),result,[dfdelbefore])
           else
              delstr(_Str,StrUtils.ReverseString(uniqueFlag),[dfdelbefore]);

             _Str:= StrUtils.ReverseString(_Str) ;
         end;

          inc(_num);
     end;

     if result='' then result:=_Str
     else  result:= StrUtils.ReverseString(result) ;
end;




function Matchstrings(Source, pattern: String): Boolean;
var
  pSource: array[0..255] of Char;
  pPattern: array[0..255] of Char;
  function MatchPattern(element, pattern: PChar): Boolean;
    function IsPatternWild(pattern: PChar): Boolean;
    begin
      Result := StrScan(pattern, '*') <> nil;
      if not Result then
        Result := StrScan(pattern, '?') <> nil;
    end;
  begin
    if 0 = StrComp(pattern, '*') then
      Result := True
    else if (element^ = Chr(0)) and (pattern^ <> Chr(0)) then
      Result := False
    else if element^ = Chr(0) then
      Result := True
    else
    begin
      case pattern^ of
        '*':
          if MatchPattern(element, @pattern[1]) then
            Result := True
          else
            Result := MatchPattern(@element[1], pattern);
          '?':
          Result := MatchPattern(@element[1], @pattern[1]);
        else
          if element^ = pattern^ then
            Result := MatchPattern(@element[1], @pattern[1])
          else
            Result := False;
      end;
    end;
  end;
begin
  StrPCopy(pSource, Source);
  StrPCopy(pPattern, pattern);
  Result := MatchPattern(pSource, pPattern);
end; {匹配字符串函数}


{从磁盘中搜索指定类型的所有文件}
procedure FindFiles(ASourceDir, SearchFileType: String; var List: TStrings);
var
  FileRec: TSearchrec;
  Sour, OldFileName, NewFileName: String;
  fs: TFileStream;
begin
  Sour := ASourceDir;
  if Sour[length(Sour)] <> '\' then
    Sour := Sour + '\';
  if FindFirst(Sour + '*.*', faAnyfile, FileRec) = 0 then
    {循环}
    repeat
      if ((FileRec.Attr and faDirectory) <> 0) then
      begin
        if (FileRec.Name <> '.') and (FileRec.Name <> '..') then //找到目录
        begin
          FindFiles(Sour + FileRec.Name, SearchFileType, List);
        end;
      end
      else //找到文件
      begin
        if Matchstrings(LowerCase(FileRec.Name), SearchFileType) then
        begin
          List.Add(Sour + FileRec.Name);
        end; {拷贝所有类型的文件}
      end;
    until FindNext(FileRec) <> 0;
  system.SysUtils.FindClose(FileRec);
end; {从磁盘中搜索指定类型的所有文件}



procedure RmHtmlTags(var src: string);
  function DelTag(var src: string): boolean;
  var
    iPosS, iPosE: integer;
  begin
    result := False;
    if pos('<script', AnsiLowerCase(src)) > 0 then
      begin
        iPosS := pos('<script', AnsiLowerCase(src));
        if iPosS > 0 then
          begin
            iPosE := pos('</script>', AnsiLowerCase(src));
            result := iPosE > iPosS;
            if result then
              Delete(src, iPosS, iPosE - iPosS + 9);
          end;
      end
    else
      begin
        iPosS := pos('<', src);
        if iPosS > 0 then
          begin
            iPosE := pos('>', src);
            result := iPosE > iPosS;
            if result then
              Delete(src, iPosS, iPosE - iPosS + 1);
          end;
      end;
  end;
begin
  //src := LowerCase(src);
  src := src;
  repeat
  until not DelTag(src);
end;

procedure RmHtmlTagsEx(var src: string);
  function DelTag(var src: string): boolean;
  var
    iPosS, iPosE: integer;
  begin
    result := False;
    if pos('<script', AnsiLowerCase(src)) > 0 then
      begin
        iPosS := pos('<script', AnsiLowerCase(src));
        if iPosS > 0 then
          begin
            iPosE := pos('</script>', AnsiLowerCase(src));
            result := iPosE > iPosS;
            if result then
              Delete(src, iPosS, iPosE - iPosS + 9);
          end;
      end
    else
    if pos('<style', AnsiLowerCase(src)) > 0 then
      begin
        iPosS := pos('<style', AnsiLowerCase(src));
        if iPosS > 0 then
          begin
            iPosE := pos('</style>', AnsiLowerCase(src));
            result := iPosE > iPosS;
            if result then
              Delete(src, iPosS, iPosE - iPosS + 9);
          end;
      end
    else
      begin
       { iPosS := pos('<', src);
        if iPosS > 0 then
          begin
            iPosE := pos('>', src);
            result := iPosE > iPosS;
            if result then
              Delete(src, iPosS, iPosE - iPosS + 1);
          end; }
      end;
  end;
begin
  //src := LowerCase(src);
  src := src;
  repeat
  until not DelTag(src);
end;


function UrlDecoder(const AUrl:string):string;
begin
  result:= UTF8Decode(HttpDecode(AUrl));
end;

function UrlEncoder(const AUrl:string):string;
begin
//URL编码通常使用“+”来替换空格。
  result:=HttpEncode(UTF8Encode(AUrl));
end;


function  getResURL(http:TIdHttp;searchWord:string):string;
var info:tstringlist;
   res:tstringstream;
   tURL:string;
  MemoText: string;
begin
   http.HandleRedirects:=true;
   http.Request.UserAgent:='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.2)';
   http.Request.Host:='search.cyol.com';
   http.Request.ContentType:='application/x-www-form-urlencoded';
   http.Request.Referer:='http://search.cyol.com/index.htm';
   http.request.CacheControl:='no-cache';
   http.HTTPOptions:=http.HTTPOptions+[hoKeepOrigProtocol];

   try
      info:=tstringlist.Create;
     res:=tstringstream.Create('',TEncoding.UTF8);

    {
      info.Add('op=new');
     info.Add('searchBtn=搜索');
     info.Add('searchText='+searchWord); //全站内模糊搜索
     // info.Add('searchText=一日为师 终身挨骂?');
    }
     info.Add('ak=');
     info.Add('ck=');
     info.Add('df=');
     info.Add('dt=');
     info.Add('nk=4');
     info.Add('od=date');
     info.Add('op=adv');
     info.Add('tk='+searchWord);

     tURL:='http://search.cyol.com/searchh.jsp';
     http.Post(tURL,info,res);
     MemoText:= res.DataString;

     delstr(MemoText,'resultdiv',[dfdelbefore]);

     //showmessage(MemoText);

     if pos('color:red',ansilowercase(MemoText))=0 then
     begin
          result:='';
          Exit;
     end;


     delstr(MemoText,'>',[dfdelbefore]);
     delstr(MemoText,'<a',[dfdelbefore]);
     delstr(MemoText,'http:',[dfdelbefore],false);
     delstr(MemoText,'.htm',[dfdelafter],false);


     result:=MemoText;


   finally
      freeandnil(info);
      freeandnil(res);
      //http.Free;
   end;
end;

function getHtmlStr(http:TIdHttp;fURL:string):string;
begin
   if assigned(http) and (http is TIdHttp) and (http<>nil) then
    result:=  http.Get(fURL);
end;



procedure TForm1.Button1Click(Sender: TObject);

var htmlText:string;
  biaoti: string;
  Author: string;
  yinti: string;
  table_Pos: Integer;
  ss: string;
  outdata: string;
  neirong: string;
  zhenwen: string;
  frontPart: string;
  subtitle: string;
  txtList: TStrings;
  i: Integer;
  readtxt: TStringList;
  zhenti: string;
  resURL: string;

begin
   button1.Caption:='正在处理'; button1.Enabled:=false;

 { htmlText:=  getHtmlStr(idHTTP1, getResURL(idHTTP1,'一日为师 终身挨骂?') );

  frontPart:=htmlText;

  delstr(frontPart,'<!--enpproperty',[dfdelbefore]);
  delstr(frontPart,'/enpproperty',[dfdelafter]);

  Author:=  GetCenterStr(frontPart,'<author>','</author>');    //作者
  subtitle:=  GetCenterStr(frontPart,'<subtitle>','</subtitle>'); //副题
  yinti:=  GetCenterStr(frontPart,'<introtitle>','</introtitle>');  //引题


  //取正文
  zhenwen:=htmlText;
  delstr(zhenwen,'<!--enpcontent-->',[dfdelbefore]);
  delstr(zhenwen,'<!--/enpcontent-->',[dfdelafter]);
  Memo1.Text:=zhenwen;

  }

  if not directoryExists(edit1.Text) then
  begin

     showmessage('请输入标引txt的路径!');
    exit;
  end;


  txtList:=tstringlist.Create ;
  readtxt:=TStringlist.Create ;
  findfiles(edit1.Text,'*.txt',txtList);

  ProgressBar1.Position:=0;
  ProgressBar1.Max:=txtlist.Count;



  try

  for i := 0 to txtList.Count-1 do
  begin
       application.ProcessMessages ;
       ProgressBar1.Position:=i+1;

       readtxt.LoadFromFile(txtList[i]);

        zhenti:=readtxt.Values['<主题>'];

        htmlText:='';  zhenwen:='';
        author:='';subtitle:=''; yinti:='';


        resURL:=getResURL(idHTTP1,trim(zhenti));

        if ''<>trim(resURL) then
        begin

            htmlText:=  getHtmlStr(idHTTP1,  resURL);

            frontPart:=htmlText;

            delstr(frontPart,'<!--enpproperty',[dfdelbefore]);
            delstr(frontPart,'/enpproperty',[dfdelafter]);

            Author:=  GetCenterStr(frontPart,'<author>','</author>');    //作者
            subtitle:=  GetCenterStr(frontPart,'<subtitle>','</subtitle>'); //副题
            yinti:=  GetCenterStr(frontPart,'<introtitle>','</introtitle>');  //引题

            //取正文
            zhenwen:=htmlText;
            delstr(zhenwen,'<!--enpcontent-->',[dfdelbefore]);
            delstr(zhenwen,'<!--/enpcontent-->',[dfdelafter]);

            RmHtmlTagsEx(zhenwen);

            if ''<>trim(yinti) then readtxt.Values['<引题>']:=yinti;
            if ''<>trim(subtitle) then readtxt.Values['<副题>']:=subtitle;
            if ''<>trim(author) then readtxt.Values['<作者>']:=author;
            if ''<>trim(zhenwen) then readtxt.Values['<正文>']:=slinebreak+trim(zhenwen);

            readtxt.SaveToFile(txtList[i]);

            readtxt.Clear ;
        end
        else
        begin
             Memo2.Lines.Add('未找到对应数据:'+txtList[i]);
        end;

  end; // for i end

  if ProgressBar1.Max=ProgressBar1.Position then
  begin
      showmessage('处理完成!');
  end;
  finally
     button1.Caption:='开始处理'; button1.Enabled:=true;
      freeandnil(readtxt);
      freeandnil(txtlist);
  end;










{  delstr(htmlText,'<body',[dfdelbefore]);
  biaoti:='biaoti';
  //取作者
  Author:=htmlText;
  delstr(Author,biaoti,[dfdelbefore]);
  delstr(Author,'rc-writer',[dfdelbefore]);
  delstr(Author,'>',[dfdelbefore]);
  delstr(Author,'<',[dfdelafter]);

  showmessage(Author);

  //取引题
  yinti:=htmlText;
  delstr(yinti,biaoti,[dfdelafter]);
  table_Pos:=0;
 //example:   ss:='<table>ccc</table><table>ddd</table>';
   yinti:=delstrByNum(yinti,'<table',1,[dfdelafter],true)+'>';
   RmHtmlTags(yinti);
   showmessage(yinti );

 //取正文内容
 neirong:='neirong';
 zhenwen:=htmlText;
 delstr(zhenwen,neirong,[dfdelbefore]);
 delstr(zhenwen,'<P',[dfdelbefore],false);
 delstr(zhenwen,'<script',[dfdelafter]);
 Memo1.Text:=zhenwen;
 }











end;

procedure TForm1.Button2Click(Sender: TObject);
var
  ss: string;
begin
   ss:=Memo1.Text;
   RmHtmlTagsEx(ss);
   memo1.Text:=ss;
end;

procedure TForm1.FormCreate(Sender: TObject);
begin
edit1.Clear ;
memo2.Clear ;
end;

end.

 

 

 

posted on 2017-03-03 08:44  游子日月长  阅读(273)  评论(0编辑  收藏  举报