一个小型浏览器架构:HTTP通讯、COOKIE处理、HTML解析、JS模拟、表单处理

前段时间作了一个HTML的解析类,方便在蜘蛛、信息发布、小偷程序中正确抓取网页内容。

 


有人可能会说,为啥不用Webbrowser呢。

1,首先是效率问题,Webbrowser太慢了。

2,你无法操作Webbrowser上传一个文件。

3,你无法操作Webbrowser跨域的Iframe

4,你不能不按套路出牌,例如页面setTimeout 100秒输出一个div,你也只能100秒后才能获取到。

5,如果你只想在后台解析HTML,不想让用户看到浏览器,Webbrowser会很碍事。

6,阻止浏览器下载图片、CSS、Flash不是一件容易的事情,而这写内容通常没啥用。

。。。。


因为是一个商业软件,不太方便直接公开全部源代码,列出各头大家了解一下吧。如果大家比较感兴趣,稍后我会核心部分做一个说明。


HTTP通讯部分

type

  ETCPSocket = class(Exception);
  ESSLTCPSocket = class(Exception);
  ECookies = class(Exception);
  EHTTP = class(Exception);

  TSockEvent = (sSend,sRecv);
  TSocketNotify = procedure(Event:TSockEvent; Bytes: Int64) of object;

  TTCPSocket = class
  strict private
    FTimeOut: DWORD;
    FSocketNotify: TSocketNotify;
    procedure SetTimeOut(Ms : DWORD);
    function GetTimeOut : DWORD;
  protected
    FSocket: TSocket;
  public
    constructor Create;
    destructor Destroy; override;
    function Open(const Host : String; const Port : WORD) : Boolean;
    function Send(const Buffer : RawByteString) : Boolean;
    function Recv(var Buffer : RawByteString) : Boolean;
    procedure Close; virtual;
    function HostToIP(const Name:String):String;
    property TimeOut : DWORD read GetTimeOut write SetTimeOut;
    property OnNotify: TSocketNotify read FSocketNotify write FSocketNotify;
  end;

  TSSLTCPSocket = class(TTCPSocket)
  strict private
    FCTXClient: PSSL_CTX;
    FSSLClient: PSSL;
  protected
    { protected declarations }
  public
    constructor Create;
    function Open(const Host : String; const Port : WORD) : Boolean;
    function Send(const Buffer : RawByteString) : Boolean;
    function Recv(var Buffer : RawByteString) : Boolean;
    procedure Close; override;
  end;

  TCookieData = class
    Domain,
    Path,
    Name,
    Value:String;
  end;

  TCookies = class
  private
    FList:TObjectList;
    function GetCount:Integer;
    function GetItems(Index: Integer):TCookieData;
    procedure SetItems(Index: Integer; CookieData:TCookieData);
    function InDomain(const Domain,Domain2:String):Boolean;
    function InPath(const Path,Path2:String):Boolean;
    procedure UpdateCookies(const CookieDomain,CookiePath,CookieName,CookieValue:String);
  protected
    { protected declarations }
  public
    constructor Create;
    destructor Destroy; override;
    procedure Clear;
    procedure Add(Domain,Path,Name,Value:String); overload;
    procedure Add(Domain,Path,Cookie:String); overload;
    procedure Delete(Index:Integer);
    property Count:Integer read GetCount;
    property Items[Index: Integer]:TCookieData read GetItems write SetItems; default;
    procedure Update(const Domain,Path,Header:String);
    function GetCookies(const Domain,Path:String): String;
    procedure SetCookies(Domain,Path,Cookies:String);
  end;

  THTTPProtocol = (hpHTTP, hpHTTPS);
  THTTPCommand = (hcGet, hcPost, hcHead, hcDownload);

  TLocation = record
    Protocol: THTTPProtocol;
    Domain: String;
    Path: String;
    Port: Integer;
  end;

  THTTPRequest = record
    Url:String;
    CharSet:String;
    Referer:String;
    CMD:THTTPCommand;
    Buffer:RawByteString;
    PostData:RawByteString;
    AddonsHeader:String;
    ContentType:RawByteString;
    StartTime:Cardinal;
    FinishTime:Cardinal;
  end;

  THTTPResponse = record
    Url:String;
    Header:RawByteString;
    RawCode:RawByteString;
    RedirectCount:Integer;
    RedirectPages: Array of RawByteString;
  end;

  THTTPOnRequest = function(Sender:TObject):Boolean of object;
  THTTPOnResponse = function(Sender:TObject):Boolean of object;
  THTTPOnProgress = function(Sender:TObject; ContentSize,Received:Integer):Boolean of object;

  THTTPManage = class;

  THTTP = class
  private
    FOwner:THTTPManage;
    FCookies:TCookies;
    FFreeCookies:Boolean;
    FTCPSocket:TTCPSocket;
    FSSLTCPSocket:TSSLTCPSocket;
    FFileHandle:THandle;
    FAgentHost:String;
    FAgentPort:Integer;
    FMaxRedirect:Integer;
    FMaxRecvByte:Int64;
    FInterval:Cardinal;
    FTimeOut:Cardinal;
    FDownloadTimeOut:Cardinal;
    FSupportSSL:Boolean;
    FOnNotify: TSocketNotify;
    FOnRequest: THTTPOnRequest;
    FOnResponse: THTTPOnResponse;
    FOnProgress: THTTPOnProgress;
    procedure SetTimeOut(val: DWORD);
    function GetCookie : String;
    procedure SetCookie(val : String);
    procedure SetNotify(const Value: TSocketNotify);
  protected
    function SendRequest:Boolean;
    procedure MakeRequestBuffer;
    function ParseUrl(const URL:String):Boolean;

    property TCPSocket:TTCPSocket read FTCPSocket;
    property SSLSocket: TSSLTCPSocket read FSSLTCPSocket;
  public
    Location:TLocation;
    Request:THTTPRequest;
    Response:THTTPResponse;
    constructor Create(const SupportSSL:Boolean=True; const ACookies:TCookies=nil; const AOwner:THTTPManage=nil);
    destructor Destroy; override;
    procedure Init;
    function Head(Url:string):Integer;
    function Get(Url:string):Integer;  overload;
    function Get(Url:string; GetField:array of String):Integer;  overload;
    function Post(Url:string; PostData:RawByteString):Integer; overload;
    function Post(Url:string; PostField:array of String):Integer; overload;
    function Post(Url:string; PostField:array of String; FileIndex: array of String; Multipart:Boolean):Integer; overload;
    function Redirect(CMD: THTTPCommand; Url:String; PostData:RawByteString; ContentType:RawByteString):Integer;
    function DoRequest(CMD:THTTPCommand; Url:String; PostData:RawByteString; ContentType:RawByteString):Integer;
    function Download(const Url: string; const SaveTo: string):Integer;
    function Encoding(Text:String; HTTPConvert:Boolean):RawByteString;
    property Cookie : String read GetCookie write SetCookie;
    property Cookies : TCookies read FCookies write FCookies;
    property Interval: Cardinal read FInterval write FInterval;
    property AgentHost : String read FAgentHost write FAgentHost;
    property AgentPort : Integer read FAgentPort write FAgentPort;
    property TimeOut : Cardinal read FTimeOut write SetTimeOut;
    property DownloadTimeOut : Cardinal read FDownloadTimeOut write FDownloadTimeOut;
    property MaxRedirect : Integer read FMaxRedirect write FMaxRedirect;
    property MaxRecvByte : Int64 read FMaxRecvByte write FMaxRecvByte;
    property OnNotify: TSocketNotify read FOnNotify write SetNotify;
    property OnRequest : THTTPOnRequest read FOnRequest write FOnRequest;
    property OnResponse : THTTPOnResponse read FOnResponse write FOnResponse;
    property OnProgress : THTTPOnProgress read FOnProgress write FOnProgress;
  end;

  THTTPManage = class
  private
    FCri:TRTLCriticalSection;
    FSendBytes,
    FRecvBytes,
    FLastSendBytes,
    FLastRecvBytes,
    FSeekSend,
    FSeekRecv:Int64;
    FSupportSSL:Boolean;
    FSendAverage:Array [0..AverageCycle-1] Of Int64;
    FRecvAverage:Array [0..AverageCycle-1] Of Int64;
    //Config
    FTimeOut:Cardinal;
    FAgentHost:String;
    FAgentPort,
    FDownloadTimeOut,
    FInterval,
    FMaxRedirect,
    FMaxRecvByte :Integer;

    function GetAgentHost:String;
    procedure SetAgentHost(val:String);
    procedure SetAgentPort(val:Integer);
    procedure SetTimeOut(val:Cardinal);
    procedure SetDownloadTimeOut(val:Integer);
    procedure SetInterval(val:Integer);
    procedure SetMaxRedirect(val:Integer);
    procedure SetMaxRecvByte(val:Integer);
  protected
    FHTTPList:TList;
    procedure Lock;
    procedure UnLock;
  public
    constructor Create(const SupportSSL:Boolean=True);
    destructor Destroy; override;
    function CreateHTTP(const Cookies:TCookies=nil):THTTP;
    procedure FreeHTTP(HTTP:THTTP);
    procedure Close; virtual;
    procedure Notify(AObject: Tobject; Operation: TOperation); virtual;
    procedure SocketEvent(Event:TSockEvent; Bytes: Int64);
    procedure Average(var AvgSend,AvgRecv:Int64);
    procedure StatClear;
    property SendBytes: Int64 read FSendBytes;
    property RecvBytes: Int64 read FRecvBytes;
    //Config
    property AgentHost: String read GetAgentHost write SetAgentHost;
    property AgentPort: Integer read FAgentPort write SetAgentPort;
    property TimeOut: Cardinal read FTimeOut write SetTimeOut;
    property DownloadTimeOut: Integer read FDownloadTimeOut write SetDownloadTimeOut;
    property Interval: Integer read FInterval write SetInterval;
    property MaxRedirect: Integer read FMaxRedirect write SetMaxRedirect;
    property MaxRecvByte: Integer read FMaxRecvByte write SetMaxRecvByte;
  end;

  function HostToIP(const Name:String):String;
  procedure LoadSSLLibrary;


HTML解析部分

const
  DoNotPush:set of TLabelEnum=[csTitle,csImg,csStyle,csScript,csInput,csTextArea];

  TLabelNames:Array [TLabelEnum] of  String = (
    'a','img','style','link','script','title','form','input','select',
    'button','textarea','option','');

  DelimiterWord:array[0..6] of String = ('','//','/*','*/','');
  DelimiterChar= ['<','>','/','\','=','&',';','"','''',' ',#9,#13,#10];
  WhiteSpaceChar= [' ',#9,#13,#10];

type

  TFindin = (fiAll,fiForms,fiFields,fiAnchors,fiImages,fiStyles,fiScripts);

  THTMLDocument = class;
  THTMLParser = class;


  THTMLElementCollection = class;
  THTMLElement = class
  private
    FID:String;
    FName:String;
    FTagName:String;
    FParent:THTMLElement;
    FChildElements:THTMLElementCollection;
    FDocument:THTMLDocument;
    FAttributes:TDictionary;
    procedure Assign(Target:THTMLElement); virtual;
    function GetInnerHTML:String;
    procedure SetInnerHTML(val:String);
    function GetOuterHTML:string;
    procedure SetOuterHTML(val:String);
    procedure AppendChildElement(Element:THTMLElement);
    procedure RemoveChildElement(Element:THTMLElement);
  public
    constructor Create(Parent:THTMLElement); virtual;
    destructor Destroy; override;
    function Clone:THTMLElement; virtual;
    procedure SetAttribute(AName,AValue:String); virtual;
    function GetAttribute(AName:String):String; virtual;
    function GetElementById(AID:String):THTMLElement;
    function GetElementsByName(AName:String):THTMLElementCollection;
    function GetElementsByTagName(ATagName:String):THTMLElementCollection;
    function HasChildElement(Child:THTMLElement):Boolean;

    property Document:THTMLDocument read FDocument write FDocument;
    property Attributes: TDictionary read FAttributes;
    property ChildElements: THTMLElementCollection read FChildElements;
    property Attribute[Name:String]:string read GetAttribute write SetAttribute;
    property Parent:THTMLElement read FParent;
    property ParentNode:THTMLElement read FParent;
    property ID:String read FID write FID;
    property Name:String read FName write FName;
    property TagName:String read FTagName write FTagName;
    property InnerHTML:String read GetInnerHTML write SetInnerHTML;
    property OuterHTML:String read GetOuterHTML write SetOuterHTML;
  end;
  THTMLClass = class of THTMLElement;

  THTMLCollection = class(TList)
  private
    FDocument:THTMLDocument;
    function GetCount: Integer;
  public
    function Item(Index:Integer): T;
    property Length:Integer read GetCount;
    property Document:THTMLDocument read FDocument write FDocument;
  end;

  THTMLScript = class(THTMLElement)
  private
    FSrc:String;
    FType:String;
    FText:String;
    FLanguage:String;
  public
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;

    property Src:String read FSrc write FSrc;
    property AType:String read FType write FType;
    property Text:String read FText write FText;
    property Language:String read FLanguage write FLanguage;
  end;

  THTMLStyle = class(THTMLElement)
  private
    FCSSText :String;
  public
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    property CSSText:String read FCSSText write FCSSText;
  end;

  THTMLImage = class(THTMLElement)
  private
    FAlt:String;
    FTitle:String;
    FSrc:String;
    function GetTextValue: string;
  public
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    property Alt:String read FAlt write FAlt;
    property Title:String read FTitle write FTitle;
    property Src:String read FSrc write FSrc;
    property TextValue:string read GetTextValue;
  end;

  THTMLAnchor =class(THTMLElement)
  private
    FHref:String;
    FDisplay:String;
  public
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    property Href:String read FHref write FHref;
    property Display:String read FDisplay write FDisplay;
  end;

  THTMLForm = class;
  THTMLField = class(THTMLElement)
  private
    FValue:String;
    FDisplay:String;
    procedure Assign(Target:THTMLElement); override;
    function GetActive:Boolean; virtual; abstract;
  protected
    FParentForm:THTMLForm;
  public
    destructor Destroy; override;
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    property Active:Boolean read GetActive;
    property ParentForm:THTMLForm read FParentForm;
    property Value:String read FValue write FValue;
    property Display:String read FDisplay write FDisplay;
  end;

  THTMLTextArea = class(THTMLField)
  private
    function GetActive:Boolean; override;
  public
    procedure Random;
  end;

  THTMLButton = class(THTMLField)
  private
    FType:String;
    procedure Assign(Target:THTMLElement); override;
    function GetActive:Boolean; override;
  public
    constructor Create(Parent:THTMLElement); override;
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    property AType:String read FType write FType;
  end;

  THTMLSelect = class;
  THTMLOption = class(THTMLField)
  private
    FSelected: Boolean;
    FParentSelect: THTMLSelect;
    procedure Assign(Target:THTMLElement); override;
    function GetActive:Boolean; override;
    procedure SetSelected(val:Boolean);
  public
    destructor Destroy; override;
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    property ParentSelect:THTMLSelect read FParentSelect;
    property Selected: Boolean read FSelected write SetSelected;
  end;

  THTMLInput = class(THTMLField)
  private
    FType:String;
    FChecked: Boolean;
    procedure Assign(Target:THTMLElement); override;
    function GetActive:Boolean; override;
    procedure SetChecked(val:Boolean);
  public
    constructor Create(Parent:THTMLElement); override;
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    procedure Random;
    property AType:string read FType write FType;
    property Checked: Boolean read FChecked write SetChecked;
  end;

  THTMLOptionCollection = class;
  THTMLSelect = class(THTMLField)
  private
    FSelected: THTMLOption;
    FOptions: THTMLOptionCollection;
    procedure Assign(Target:THTMLElement); override;
    function GetActive:Boolean; override;
    procedure SetSelected(val: THTMLOption);
    procedure SetParentFrom(const Value: THTMLForm);
  public
    constructor Create(Parent:THTMLElement); override;
    destructor Destroy; override;

    procedure Random;
    procedure AddOption(val:THTMLOption);
    procedure Notify(Element: THTMLOption; Operation: TOperation);
    property ParentFrom:THTMLForm read FParentForm write SetParentFrom;
    property Selected:THTMLOption read FSelected write SetSelected;
    property Options:THTMLOptionCollection read FOptions;
  end;  

  TRadioFields = Array of THTMLInput;
  THTMLFieldCollection = class;

  THTMLForm = class(THTMLElement)
  private
    FMethod:String;
    FENCType:String;
    FAction:String;
    FReferer:String;
    FFields:THTMLFieldCollection;
    procedure Assign(Target:THTMLElement); override;
    function GetRadioFields(AName:String):TRadioFields;
    function GetActiveFields(AName:String):THTMLField;
    function GetFieldValues(AName:String):String;
    procedure SetFieldValues(AName,AValue:String);
  public
    constructor Create(Parent:THTMLElement); override;
    destructor Destroy; override;

    procedure AddField(val:THTMLField);
    procedure Notify(Element: THTMLField; Operation: TOperation);
    function Find(Keys,Values:string):THTMLField; overload;
    function Find(Keys,Values:string; Match:TMatch):THTMLField; overload;
    function Find(Keys,Values:array of string):THTMLField; overload;
    function Find(Keys,Values:array of string; Match:TMatch):THTMLField; overload;
    function GetAttribute(AName:String):String; override;
    procedure SetAttribute(AName,AValue:String); override;
    function Submit(PostField:array of String; const ButtonName:string=''):Integer;
    function CheckBox(Checked:Boolean):Boolean; overload;
    function CheckBox(AName:String; Checked:Boolean):Boolean; overload;
    property Fields:THTMLFieldCollection read FFields;
    property ActiveFields[Name:String]:THTMLField read GetActiveFields;
    property RadioFields[Name:String]:TRadioFields read GetRadioFields;
    property FieldValues[Name:String]:String read GetFieldValues write SetFieldValues;
    property Method:String read FMethod write FMethod;
    property ENCType:String read FENCType write FENCType;
    property Action:String read FAction write FAction;
  end;

  THTMLElementCollection = class(THTMLCollection);
  THTMLScriptCollection = class(THTMLCollection);
  THTMLStyleCollection = class(THTMLCollection);
  THTMLImageCollection = class(THTMLCollection);
  THTMLAnchorCollection = class(THTMLCollection);
  THTMLFieldCollection = class(THTMLCollection);
  THTMLOptionCollection = class(THTMLCollection);
  THTMLFormCollection = class(THTMLCollection)
  public
    function Find(Keys,Values:String):THTMLForm; overload;
    function Find(Keys,Values:String; Match:TMatch):THTMLForm; overload;
    function Find(Keys,Values:array of String; Match:TMatch):THTMLForm; overload;
    function FindByField(Keys,Value:String):THTMLForm; overload;
    function FindByField(Keys,Value:String; Match:TMatch):THTMLForm; overload;
    function FindByField(Keys,Values:array of String; Match:TMatch):THTMLForm; overload;
    function FindWithField(FormKeys,FormValues:String;
                           FieldKeys,FieldValues:String):THTMLForm; overload;
    function FindWithField(FormKeys,FormValues:String; FormMatch:TMatch;
                           FieldKeys,FieldValues:String; FieldMatch:TMatch):THTMLForm; overload;
    function FindWithField(FormKeys,FormValues:array of String; FormMatch:TMatch;
                           FieldKeys,FieldValues:array of String; FieldMatch:TMatch):THTMLForm; overload;
  end;

  TEvalEvent = function(Sender:TObject; Script:string):Variant of object;
  TSubmitEvent = procedure(Sender:TObject; Return:Integer) of object;
  TChangedEvent = procedure(Sender:TObject; Obj:THTMLElement; Operation:TOperation) of object;

  THTMLDocument = class
  private
    FHTTP:THTTP;
    FURL:String;
    FTitle:String;
    FSource:String;
    FDisplay:String;
    FRoot:THTMLElementCollection;
    FAll:THTMLElementCollection;
    FForms:THTMLFormCollection;
    FFields:THTMLFieldCollection;
    FAnchors:THTMLAnchorCollection;
    FImages:THTMLImageCollection;
    FStyles:THTMLStyleCollection;
    FScripts:THTMLScriptCollection;
    FFinds:THTMLElementCollection;
    FOnEval: TEvalEvent;
    FOnSubmit:TSubmitEvent;
    FOnChanged:TChangedEvent;

    function GetMetaRefresh: string;
  public
    constructor Create(AHTTP:THTTP);
    destructor Destroy; override;
    procedure Clear;
    procedure Notify(Element: THTMLElement; Operation: TOperation); virtual;

    function Find(Keys,Values:array of String):THTMLElement; overload;
    function Find(Keys,Values:array of String; Match:TMatch):THTMLElement; overload;
    function Find(Keys,Values:array of String; FindIn:TFindin):THTMLElement; overload;
    function Find(Keys,Values:array of String; Parent:THTMLElement):THTMLElement; overload;
    function Find(Keys,Values:array of String; Match:TMatch; FindIn:TFindin):THTMLElement; overload;
    function Find(Keys,Values:array of String; Match:TMatch; Parent:THTMLElement):THTMLElement; overload;
    function Find(Keys,Values:array of String; Match:TMatch; FindIn:TFindin; Parent:THTMLElement):THTMLElement; overload;
    function Finds(Keys,Values:array of String):THTMLElementCollection; overload;
    function Finds(Keys,Values:array of String; Match:TMatch):THTMLElementCollection; overload;
    function Finds(Keys,Values:array of String; FindIn:TFindin):THTMLElementCollection; overload;
    function Finds(Keys,Values:array of String; Parent:THTMLElement):THTMLElementCollection; overload;
    function Finds(Keys,Values:array of String; Match:TMatch; FindIn:TFindin):THTMLElementCollection; overload;
    function Finds(Keys,Values:array of String; Match:TMatch; Parent:THTMLElement):THTMLElementCollection; overload;
    function Finds(Keys,Values:array of String; Match:TMatch; FindIn:TFindin; Parent:THTMLElement):THTMLElementCollection; overload;

    property URL:String read FURL;
    property Title:String read FTitle;
    property Source:String read FSource;
    property Display:String read FDisplay;
    property All:THTMLElementCollection read FAll;
    property Root:THTMLElementCollection read FRoot;
    property Forms:THTMLFormCollection read FForms;
    property Fields:THTMLFieldCollection read FFields;
    property Anchors:THTMLAnchorCollection read FAnchors;
    property Images:THTMLImageCollection read FImages;
    property Styles:THTMLStyleCollection read FStyles;
    property Scripts:THTMLScriptCollection read FScripts;
    property MetaRefresh:string read GetMetaRefresh;
    property OnEval:TEvalEvent read FOnEval write FOnEval;
    property OnSubmit:TSubmitEvent read FOnSubmit write FOnSubmit;
    property OnChanged:TChangedEvent read FOnChanged write FOnChanged;
  end;

  THTMLToken = class
  private
    FIndex: Integer;
    FTokenList: TList;
    function GetEOF:Boolean;
    function GetCurrToken:String;
    function GetNextToken:string;
    function GetNextNextToken:string;
    function GetPrevToken:String;
    function GetToken(Index:Integer):String;
  public
    constructor Create;
    destructor Destroy; override;   

    procedure Clear;
    procedure Process(Source:String);

    function IsSpace(Token:String):Boolean;
    function MoveNext:Boolean;
    function SkipSpace:Boolean;
    function SkipToken(Token:String):Boolean;
    function Preview(Tokens:array of String):Boolean;
    function MatchToken(Token:String):Boolean; overload;
    function MatchToken(Tokens:array of String):Boolean; overload;

    property EOF:Boolean read GetEOF;
    property CurrToken:String read GetCurrToken;
    property NextToken:String read GetNextToken;
    property NextNextToken:String read GetNextNextToken;
    property PrevToken:String read GetPrevToken;
    property Items[Index:Integer]: String read GetToken; default;
  end;

  TParseEvent = procedure(Sender:TObject) of object;

  THTMLParser = class
  type
    TransferType = (ttNone,ttHTML,ttScript,ttStyle);
    BlockType = (btNone,btRegExp,btString,btComments);
  private
    FOwner:THTMLParser;
    FDocument:THTMLDocument;
    FLastContent:String;
    FBlock: BlockType;
    FEndBlock: TList;
    FTransferType:TransferType;
    FTokenList:THTMLToken;
    FStack:TStack;

    procedure CheckBlock;
    function GetValue:String;
    function GetTransfer:String;
    function GetInner(TagName:String):String;
    function CheckName(Name:String):Boolean;

    procedure Pop(T:TClass);
    function Peek:THTMLElement;
    procedure Push(Element:THTMLElement);
    procedure AddContent(Text:String);
    procedure Addobject(Element:THTMLElement);
    procedure UpdateParent(Text:String; Element:THTMLElement);
    function LabelInfo(TagName:String; var LabelClass:THTMLClass):TLabelEnum;

    procedure Process(ASource:String);
    procedure ProcessToken;
    procedure ProcessLabel;
    procedure ProcessCDATA;
    procedure ProcessContent;
    procedure ProcessComments;
    procedure ProcessLabelOpen(Name:String);
    procedure ProcessLabelClose(Name:String);
    function ProcessLabelValue(Element:THTMLElement):Boolean;
  public
    constructor Create(ADocument:THTMLDocument; const AOwner:THTMLParser=nil);
    destructor Destroy; override;
    procedure Clear;
  end;

  THTMLEngine = class(THTMLDocument)
  private
    FCharset:String;
    FParser:THTMLParser;
    FRawCode:RawByteString;
    FMetaRefreshEnable:Boolean;
    FOnBeforeParse:TParseEvent;
    FOnAfterParse:TParseEvent;

    function GetCharset:String; overload;
    procedure SetCharset(const Value: String);
  public
    constructor Create(AHTTP:THTTP);
    destructor Destroy; override;

    procedure Clear;
    procedure Load(AUrl:string; ARawCode:RawByteString);
    procedure InsertSource(AddCode:String);

    function Translate(const Text:string):String;
    function GetCharset(const RawCode:RawByteString):String; overload;
    function Decoding(const RawCode:RawByteString):String; overload;
    function Decoding(const RawCode:RawByteString; ACharSet:String):String; overload;
    property Charset:String read FCharset write SetCharset;
    property MetaRefreshEnable:Boolean read FMetaRefreshEnable write FMetaRefreshEnable;
    property OnBeforeParse:TParseEvent read FOnBeforeParse write FOnBeforeParse;
    property OnAfterParse:TParseEvent read FOnAfterParse write FOnAfterParse;
  end;

 

Javascript模拟部分

type
  TScriptEngine=Class;

  TScriptBase = class
  private
    FHTML:THTMLEngine;
    FScript:TScriptEngine;
  public
    constructor Create(AScriptEngine:TScriptEngine);
    property HTMLEngine:THTMLEngine read FHTML write FHTML;
    property ScriptEngine:TScriptEngine read FScript write FScript;
  end;

  TScriptNavigator = class(TScriptBase)
  private
    FappCodeName,FappMinorVersion,FappName,FappVersion,
    FbrowserLanguage,FcookieEnabled,FcpuClass,FonLine,
    Fplatform,FsystemLanguage,FuserAgent,FuserLanguage:Variant;
  public
    constructor Create(AScriptEngine:TScriptEngine);
    function javaEnabled:Boolean;
    function taintEnabled:Boolean;

    property appCodeName:Variant read FappCodeName write FappCodeName;
    property appMinorVersion:Variant read FappMinorVersion write FappMinorVersion;
    property appName:Variant read FappName write FappName;
    property appVersion:Variant read FappVersion write FappVersion;
    property browserLanguage:Variant read FbrowserLanguage write FbrowserLanguage;
    property cookieEnabled:Variant read FcookieEnabled write FcookieEnabled;
    property cpuClass:Variant read FcpuClass write FcpuClass;
    property onLine:Variant read FonLine write FonLine;
    property platform:Variant read  Fplatform write Fplatform;
    property systemLanguage:Variant read  FsystemLanguage write FsystemLanguage;
    property userAgent:Variant read FuserAgent write FuserAgent;
    property userLanguage:Variant read FuserLanguage write FuserLanguage;
  end;

  TScriptScreen = class(TScriptBase)
  private
    FavailHeight,FavailWidth,FbufferDepth,FcolorDepth,
    FdeviceXDPI,FdeviceYDPI,FfontSmoothingEnabled,
    Fheight,FlogicalXDPI,FlogicalYDPI,FpixelDepth,
    FupdateInterval,Fwidth:Variant;
  public
    constructor Create(AScriptEngine:TScriptEngine);

    property availHeight:Variant read FavailHeight write FavailHeight;
    property availWidth:Variant read FavailWidth write FavailWidth;
    property bufferDepth:Variant read FbufferDepth write FbufferDepth;
    property colorDepth:Variant read FcolorDepth write FcolorDepth;
    property deviceXDPI:Variant read FdeviceXDPI write FdeviceXDPI;
    property deviceYDPI:Variant read FdeviceYDPI write FdeviceYDPI;
    property fontSmoothingEnabled:Variant read FfontSmoothingEnabled write FfontSmoothingEnabled;
    property height:Variant read Fheight write Fheight;
    property logicalXDPI:Variant read FlogicalXDPI write FlogicalXDPI;
    property logicalYDPI:Variant read FlogicalYDPI write FlogicalYDPI;
    property pixelDepth:Variant read FpixelDepth write FpixelDepth;
    property updateInterval:Variant read FupdateInterval write FupdateInterval;
    property width:Variant read Fwidth write Fwidth;
  end;

  TScriptHistory = class(TScriptBase)
  private
    Flength:Integer;
  public
     constructor Create(AScriptEngine:TScriptEngine);
     property length:Integer read Flength write Flength;
     procedure back();
     procedure forward();
     procedure go();
  end;

  TScriptLocation = class(TScriptBase)
  private
    Fhash,Fhost,Fhostname,Fhref,Fpathname,Fport,Fprotocol,Fsearch:Variant;
    procedure SetHref(const Value: Variant);
  public
    constructor Create(AScriptEngine:TScriptEngine);
    procedure assign(URL:Variant);
    procedure reload();
    procedure replace(URL:Variant);

    property hash:Variant read Fhash write Fhash;
    property host:Variant read Fhost write Fhost;
    property hostname:Variant read Fhostname write Fhostname;
    property href:Variant read Fhref write SetHref;
    property pathname:Variant read Fpathname write Fpathname;
    property port:Variant read Fport write Fport;
    property protocol:Variant read Fprotocol write Fprotocol;
    property search:Variant read Fsearch write Fsearch;
  end;

  TScriptDocument = class(TScriptBase)
  private
    Fall,Fanchors,Fapplets,Fforms,Fimages,Flinks,Fbody:Variant;
    Fcookie,Fdomain,FlastModified,Freferrer,Ftitle,FURL:Variant;
  public
    constructor Create(AScriptEngine:TScriptEngine);
    procedure close();
    function getElementById(id:Variant):Variant;
    function getElementsByName(name:Variant):Variant;
    function getElementsByTagName(tagname:Variant):Variant;
    function open(mimetype,replace:Variant):Variant;
    procedure write(exp:Variant);
    procedure writeln(exp:Variant);

    property cookie:Variant read Fcookie write Fcookie;
    property domain:Variant read Fdomain write Fdomain;
    property lastModified:Variant read FlastModified write FlastModified;
    property referrer:Variant read Freferrer write Freferrer;
    property title:Variant read Ftitle write Ftitle;
    property URL:Variant read FURL write FURL;
    property all:Variant read Fall write Fall;
    property anchors:Variant read Fanchors write Fanchors;
    property applets:Variant read Fapplets write Fapplets;
    property forms:Variant read Fforms write Fforms;
    property images:Variant read Fimages write Fimages;
    property links:Variant read Flinks write Flinks;
    property body:Variant read Fbody write Fbody;
  end;

  TScriptWindow = class(TScriptBase)
  private
    ObjDocument:TScriptDocument;
    ObjHistory:TScriptHistory;
    ObjNavigator:TScriptNavigator;
    ObjScreen:TScriptScreen;
    ObjLocation:TScriptLocation;
    FTemp,
    Fclosed,FdefaultStatus,Finnerheight,Finnerwidth,Flength,
    Fname,Fouterheight,Fouterwidth,FpageXOffset,FpageYOffset,
    Fstatus,FscreenLeft,FscreenTop,FscreenX,FscreenY,
    Fframes,Fdocument,Fhistory,FNavigator,FScreen,Fwindow,Flocation:Variant;
    procedure SetLocation(const Value: Variant);
  public
    constructor Create(AScriptEngine:TScriptEngine);
    destructor Destroy; override;

    procedure alert(msg:Variant);
    procedure blur();
    procedure clearInterval(id:Variant);
    procedure clearTimeout(id:Variant);
    procedure close();
    procedure confirm(msg:Variant);
    function createPopup():Variant;
    procedure focus();
    procedure moveBy(x,y:Variant);
    procedure moveTo(x,y:Variant);
    function open(url,name,features,replace:Variant):Variant;
    procedure print();
    function prompt(text,defaultText:Variant):Variant;
    procedure resizeBy(width,height:Variant);
    procedure resizeTo(width,height:Variant);
    procedure scrollBy(xnum,ynum:Variant);
    procedure scrollTo(xpos,ypos:Variant);
    function setInterval(code,millisec:Variant):Variant;
    function setTimeout(code,millisec:Variant):Variant;
    function getElementById(id:Variant):Variant;
    function getElementsByName(name:Variant):Variant;
    function getElementsByTagName(tagname:Variant):Variant;

    property closed:Variant read Fclosed write Fclosed;
    property defaultStatus:Variant read FdefaultStatus write FdefaultStatus;
    property innerheight:Variant read Finnerheight write Finnerheight;
    property innerwidth:Variant read Finnerwidth write Finnerwidth;
    property length:Variant read Flength write Flength;
    property name:Variant read Fname write Fname;
    property outerheight:Variant read Fouterheight write Fouterheight;
    property outerwidth:Variant read Fouterwidth write Fouterwidth;
    property pageXOffset:Variant read FpageXOffset write FpageXOffset;
    property pageYOffset:Variant read FpageYOffset write FpageYOffset;
    property status:Variant read Fstatus write Fstatus;
    property screenLeft:Variant read FscreenLeft write FscreenLeft;
    property screenTop:Variant read FscreenTop write FscreenTop;
    property screenX:Variant read FscreenX write FscreenX;
    property screenY:Variant read FscreenY write FscreenY;

    property frames:Variant read Fframes write Fframes;
    property self:Variant read Fwindow write FTemp;
    property opener:Variant read Fwindow write FTemp;
    property top:Variant read Fwindow write FTemp;
    property window:Variant read Fwindow write FTemp;
    property document:Variant read Fdocument write FTemp;
    property history:Variant read Fhistory write Fhistory;
    property Navigator:Variant read FNavigator write FNavigator;
    property Screen:Variant read FScreen write FScreen;
    property location:Variant read Flocation write SetLocation;
  end;

  TScriptLanguage = (slJavaScript,slVBScript);

  TScriptEngine=Class
  private
    FHTTP:THTTP;
    FDocument:THTMLEngine;
    FWindow:TScriptWindow;
    FRefManage:TRefManage;
    FLanguage:TScriptLanguage;
    FVBScriptEngine:Variant;
    FJavaScriptEngine:Variant;
    FNeedRedirect:Boolean;
    FRedirectUrl:String;
    procedure CreateInstance;
  public
    constructor Create(AHTTP:THTTP; AHTML:THTMLEngine);
    destructor Destroy; override;
    procedure Reset;
    procedure BeforeProcess(Sender:TObject);
    procedure AfterProcess(Sender:TObject);
    function ProcessEval(Sender:TObject; Script:string):Variant;
    procedure ProcessObject(Sender:TObject; Obj:THTMLElement; Operation:TOperation);
    procedure Execute(const Code:string; ALanguage:TScriptLanguage);
    function Eval(const Expression:string; ALanguage:TScriptLanguage):Variant;
    function Transform(Obj:TObject; Owned:Boolean=False):Variant;
    property Document:THTMLEngine read FDocument write FDocument;
    property Language:TScriptLanguage read FLanguage;
  End;

三个部分协调控制类
type
  TWebbotSupport = set of (wsHTTPS,wsHTML,wsScript);

  TWebbotManage = class;
  TWebbot = class
  private
    FHTTP:THTTP;
    FDocument:THTMLEngine;
    FCookies:TCookies;
    FFreeCookies:Boolean;
    FScript:TScriptEngine;
    FOwner:TWebbotManage;
    FSupport:TWebbotSupport;
    procedure OnSubmit(Sender:TObject; Return:Integer);
  public
    constructor Create(const ASupport:TWebbotSupport=[wsHTTPS,wsHTML,wsScript]; const ACookies:TCookies=nil; const AOwner:TWebbotManage=nil);
    destructor Destroy; override;
    function Clone:TWebbot;
    function Get(Url:string):Integer;
    function Post(Url:string; PostData:AnsiString):Integer; overload;
    function Post(Url:string; PostField:array of String):Integer; overload;

    property HTTP:THTTP read FHTTP;
    property Document:THTMLEngine read FDocument;
    property Cookies:TCookies read FCookies;
    property Script:TScriptEngine read FScript;
  end;

  TWebbotManage = class(THTTPManage)
  private
    FWebbotList:TList<TWebbot>;
    FSupport:TWebbotSupport;
  public
    constructor Create(const ASupport:TWebbotSupport=[wsHTTPS,wsHTML,wsScript]); overload;
    destructor Destroy; override;

    procedure Notify(AObject: Tobject; Operation: TOperation); override;
    function CreateWebbot(const Cookies:TCookies=nil):TWebbot;
    procedure FreeWebbot(Browser:TWebbot);
  end;

 

posted @ 2010-09-04 14:42  庄园  阅读(3058)  评论(5编辑  收藏  举报