// Delphi高效的字符串处理 (Efficient string processing in Delphi)

unit FReplace;

// Fast substring search and replace routines built on 32-bit inline
// assembler. The implementations read the strings one byte at a time and
// take lengths as byte counts.

interface

type
  // Common signature of the FastPos* search routines, so that FastReplace can
  // pick the matching routine once instead of testing CaseSensitive inside
  // its loop.
  TFastPosProc = function(
    const aSourceString, aFindString: String;
    const aSourceLen, aFindLen, StartPos: Integer
  ): Integer;

// Returns a copy of aSourceString in which every occurrence of aFindString
// has been replaced by aReplaceString. The search ignores the case of
// 'a'..'z' unless CaseSensitive is True.
function FastReplace(
  var aSourceString: String;
  const aFindString, aReplaceString: String;
  CaseSensitive: Boolean = False): String;

// Case-sensitive search. Returns the 1-based position of the first occurrence
// of aFindString in aSourceString at or after StartPos, or 0 if there is none.
// aSourceLen and aFindLen are the lengths of the two strings.
function FastPos(
  const aSourceString, aFindString: String;
  const aSourceLen, aFindLen, StartPos: Integer
): Integer;

// Same as FastPos, but 'a'..'z' in both strings are upper-cased before each
// comparison.
function FastPosNoCase(
  const aSourceString, aFindString: String;
  const aSourceLen, aFindLen, StartPos: Integer
): Integer;

// Same as FastPosNoCase, but only the source bytes are upper-cased;
// aFindString is compared as passed, so the caller must supply it already
// upper-cased.
function FastPosNoCaseNoUpcaseFindString(
  const aSourceString, aFindString: String;
  const aSourceLen, aFindLen, StartPos: Integer
): Integer;

// Returns True when aSourceString starts with aSubString (case-sensitive).
function IsBeginOfString(
  const aSubString, aSourceString: String;
  const aSubLen, aSourceLen: Integer
): Boolean;

implementation

// Returns True when the first aSubLen bytes of aSourceString are identical to
// aSubString (case-sensitive prefix test).
//
//   aSubString    - candidate prefix
//   aSourceString - string to test
//   aSubLen       - length of aSubString in bytes
//   aSourceLen    - length of aSourceString in bytes
//
// Returns False when aSubLen is not positive or when aSubString is longer
// than aSourceString.
function IsBeginOfString(
  const aSubString,aSourceString:String;
  const aSubLen,aSourceLen:integer
  ):boolean;
begin
  // Reject empty/negative prefix lengths and prefixes longer than the source
  // before touching any memory: an empty string is a nil pointer, and a
  // negative length would make the loop below index before the buffers.
  if (aSourceLen < aSubLen) or (aSubLen <= 0) then
  begin
    Result := False;
    Exit;
  end;

  asm
    // ESI, EDI and EBX must be preserved for the compiler.
    push ESI
    push EDI
    push EBX

    mov  EDI, aSourceString
    mov  ESI, aSubString

    // Compare the first byte; bail out immediately when it differs.
    mov  AL, [ESI]
    mov  AH, [EDI]
    cmp  AH, AL
    jne  @Result0

    // EBX := index of the last byte of aSubString. If that is 0 the prefix
    // is a single byte, which has already been compared above.
    mov  EBX, aSubLen
    dec  EBX
    jz   @EndOfMatch

  @CompareNext:
    // Compare from the last byte backwards down to index 1.
    mov  AL, [ESI+EBX]
    mov  AH, [EDI+EBX]
    cmp  AL, AH
    jne  @Result0
    dec  EBX
    jnz  @CompareNext

  @EndOfMatch:
    // Every byte matched.
    mov  Result, 1
    jmp  @TheEnd

  @Result0:
    mov  Result, 0

  @TheEnd:
    // Restore in the reverse order of the pushes.
    pop  EBX
    pop  EDI
    pop  ESI
  end;
end;

// This TYPE declaration will become apparent later.

// The first thing to note here is that I'm passing the SourceLength and
// FindLength. As neither Source nor Find will alter at any point during
// FastReplace, there's no need to call the LENGTH subroutine each time!

// Case-sensitive substring search.
//
//   aSourceString - string to search in
//   aFindString   - string to search for
//   aSourceLen    - length of aSourceString in bytes
//   aFindLen      - length of aFindString in bytes
//   StartPos      - 1-based position in aSourceString to start searching at
//
// Returns the 1-based position of the first match at or after StartPos, or 0
// when there is no match, when aFindLen or StartPos is less than 1, or when
// fewer than aFindLen bytes remain from StartPos.
//
// While advancing through the source, a byte >= $80 is treated as the first
// byte of a two-byte (e.g. Chinese) character and the following byte is
// skipped as well, so a match never starts on the second byte of such a pair.
function FastPos(
  const aSourceString, aFindString : String;
  const aSourceLen, aFindLen, StartPos : integer
  ) : integer;
begin
  // An empty find string is a nil pointer and StartPos < 1 would point before
  // the source buffer; neither can match, so answer without touching memory.
  if (aFindLen < 1) or (StartPos < 1) then
  begin
    Result := 0;
    Exit;
  end;

  asm
    // Delphi uses ESI, EDI and EBX a lot, so we must preserve them.
    push ESI
    push EDI
    push EBX

    // ECX := number of positions at which a match could still start:
    //   (aSourceLen - aFindLen) - (StartPos - 1) + 1
    // A negative intermediate value means aFindString cannot fit.
    mov  ECX, aSourceLen
    mov  EAX, aFindLen
    sub  ECX, EAX
    jl   @Result0
    mov  EAX, StartPos
    dec  EAX
    sub  ECX, EAX
    jl   @Result0
    inc  ECX

    // EDI := address of aSourceString[StartPos]. Starting at StartPos is what
    // lets callers find the NEXT occurrence rather than always the first.
    mov  EDI, aSourceString
    add  EDI, StartPos
    dec  EDI

    // ESI := address of aFindString[1].
    mov  ESI, aFindString

    // AL := first byte of aFindString. Loaded outside the scan loop because
    // it never changes.
    mov  AL, [ESI]

  @ScaSB:
    // Scan for the first byte of aFindString.
    mov  AH, [EDI]
    cmp  AH, AL
    jne  @NextChar

    // First byte matches: compare the rest, from the last byte backwards.
    // EBX := index of the last byte; 0 means aFindLen = 1, which is already
    // a complete match.
    mov  EBX, aFindLen
    dec  EBX
    jz   @EndOfMatch

    // Another optimization tip: people at this point usually PUSH ESI and so
    // on and then POP them at the end. Instead, ESI and EDI are not changed
    // at all during the comparison, which saves the pushing and popping.
  @CompareNext:
    mov  AL, [ESI+EBX]
    mov  AH, [EDI+EBX]
    cmp  AL, AH
    jz   @Matches

    // Mismatch: AL was overwritten, so reload the first byte of aFindString
    // and resume scanning.
    mov  AL, [ESI]
    jmp  @NextChar

  @Matches:
    // Step to the previous byte; index 0 was compared in the scan loop.
    dec  EBX
    jnz  @CompareNext

  @EndOfMatch:
    // EDI still points at the start of the match. Convert the address to a
    // 1-based character position.
    mov  EAX, EDI
    sub  EAX, aSourceString
    inc  EAX
    mov  Result, EAX
    jmp  @TheEnd

  @NextChar:
    // Continue searching for the first byte of aFindString in aSourceString.
    // Keep the current byte in AH so we can tell whether it starts a
    // two-byte character.
    mov  AH, [EDI]
    inc  EDI
    // One candidate position fewer; stop when none are left.
    dec  ECX
    jz   @Result0
    cmp  AH, $80
    jb   @ScaSB
    // Two-byte character: skip its second byte too.
    inc  EDI
    dec  ECX
    jnz  @ScaSB

  @Result0:
    // No candidate positions left: not found.
    mov  Result, 0

  @TheEnd:
    // Restore EBX, EDI, ESI in the opposite order they were pushed.
    pop  EBX
    pop  EDI
    pop  ESI
  end;
end;

// This routine is an identical copy of FastPos except where commented! The
// idea is that when grabbing bytes, it ANDs them with $df, effectively making
// them uppercase before comparing. Maybe this would be quicker if aFindString
// was made uppercase in one fell swoop at the beginning of the function,
// saving an AND instruction each time.

// Case-insensitive substring search.
//
//   aSourceString - string to search in
//   aFindString   - string to search for (any case)
//   aSourceLen    - length of aSourceString in bytes
//   aFindLen      - length of aFindString in bytes
//   StartPos      - 1-based position in aSourceString to start searching at
//
// Returns the 1-based position of the first match at or after StartPos, or 0
// when there is no match, when aFindLen or StartPos is less than 1, or when
// fewer than aFindLen bytes remain from StartPos.
//
// Only bytes in 'a'..'z' ($61..$7A) are upper-cased (AND $DF) before being
// compared; every other byte is compared as is. While advancing through the
// source, a byte >= $80 is treated as the first byte of a two-byte (e.g.
// Chinese) character and the following byte is skipped as well.
function FastPosNoCase(
  const aSourceString, aFindString : String;
  const aSourceLen, aFindLen, StartPos : integer
  ) : integer;
begin
  // An empty find string is a nil pointer and StartPos < 1 would point before
  // the source buffer; neither can match, so answer without touching memory.
  if (aFindLen < 1) or (StartPos < 1) then
  begin
    Result := 0;
    Exit;
  end;

  asm
    // ESI, EDI and EBX must be preserved for the compiler.
    push ESI
    push EDI
    push EBX

    // ECX := number of positions at which a match could still start:
    //   (aSourceLen - aFindLen) - (StartPos - 1) + 1
    mov  ECX, aSourceLen
    mov  EAX, aFindLen
    sub  ECX, EAX
    jl   @Result0
    mov  EAX, StartPos
    dec  EAX
    sub  ECX, EAX
    jl   @Result0
    inc  ECX

    // EDI := address of aSourceString[StartPos], ESI := address of
    // aFindString[1].
    mov  EDI, aSourceString
    add  EDI, StartPos
    dec  EDI
    mov  ESI, aFindString

    // AL := first byte of aFindString, upper-cased if it is 'a'..'z'.
    mov  AL, [ESI]
    cmp  AL, $7A
    ja   @ScaSB
    cmp  AL, $61
    jb   @ScaSB
    and  AL, $DF

  @ScaSB:
    // AH := current source byte, upper-cased if it is 'a'..'z'.
    mov  AH, [EDI]
    cmp  AH, $7A
    ja   @CompareChar
    cmp  AH, $61
    jb   @CompareChar
    and  AH, $DF

  @CompareChar:
    cmp  AH, AL
    jne  @NextChar

    // First byte matches: compare the rest, from the last byte backwards.
    // EBX = 0 means aFindLen = 1, which is already a complete match.
    mov  EBX, aFindLen
    dec  EBX
    jz   @EndOfMatch

  @CompareNext:
    mov  AL, [ESI+EBX]
    mov  AH, [EDI+EBX]
    // Upper-case AL if it is 'a'..'z'.
    cmp  AL, $7A
    ja   @UpperAh
    cmp  AL, $61
    jb   @UpperAh
    and  AL, $DF

  @UpperAh:
    // Upper-case AH if it is 'a'..'z'.
    cmp  AH, $7A
    ja   @CompareChar2
    cmp  AH, $61
    jb   @CompareChar2
    and  AH, $DF

  @CompareChar2:
    cmp  AL, AH
    jz   @Matches

    // Mismatch: AL was overwritten, so reload and upper-case the first byte
    // of aFindString before resuming the scan.
    mov  AL, [ESI]
    cmp  AL, $7A
    ja   @NextChar
    cmp  AL, $61
    jb   @NextChar
    and  AL, $DF
    jmp  @NextChar

  @Matches:
    // Step to the previous byte; index 0 was compared in the scan loop.
    dec  EBX
    jnz  @CompareNext

  @EndOfMatch:
    // EDI still points at the start of the match. Convert the address to a
    // 1-based character position.
    mov  EAX, EDI
    sub  EAX, aSourceString
    inc  EAX
    mov  Result, EAX
    jmp  @TheEnd

  @NextChar:
    // Keep the raw current byte in AH to detect a two-byte character.
    mov  AH, [EDI]
    inc  EDI
    // One candidate position fewer; stop when none are left.
    dec  ECX
    jz   @Result0
    cmp  AH, $80
    jb   @ScaSB
    // Two-byte character: skip its second byte too.
    inc  EDI
    dec  ECX
    jnz  @ScaSB

  @Result0:
    mov  Result, 0

  @TheEnd:
    // Restore in the reverse order of the pushes.
    pop  EBX
    pop  EDI
    pop  ESI
  end;
end;

//add by shengquanhu

// Case-insensitive substring search for a find string that the caller has
// already upper-cased.
//
//   aSourceString - string to search in
//   aFindString   - string to search for; compared exactly as passed, so its
//                   'a'..'z' bytes must already be upper-case to ever match
//   aSourceLen    - length of aSourceString in bytes
//   aFindLen      - length of aFindString in bytes
//   StartPos      - 1-based position in aSourceString to start searching at
//
// Returns the 1-based position of the first match at or after StartPos, or 0
// when there is no match, when aFindLen or StartPos is less than 1, or when
// fewer than aFindLen bytes remain from StartPos.
//
// Source bytes in 'a'..'z' ($61..$7A) are upper-cased (AND $DF) before being
// compared. While advancing through the source, a byte >= $80 is treated as
// the first byte of a two-byte (e.g. Chinese) character and the following
// byte is skipped as well.
function FastPosNoCaseNoUpcaseFindString(
  const aSourceString, aFindString : String;
  const aSourceLen, aFindLen, StartPos : integer
  ) : integer;
begin
  // An empty find string is a nil pointer and StartPos < 1 would point before
  // the source buffer; neither can match, so answer without touching memory.
  if (aFindLen < 1) or (StartPos < 1) then
  begin
    Result := 0;
    Exit;
  end;

  asm
    // ESI, EDI and EBX must be preserved for the compiler.
    push ESI
    push EDI
    push EBX

    // ECX := number of positions at which a match could still start:
    //   (aSourceLen - aFindLen) - (StartPos - 1) + 1
    mov  ECX, aSourceLen
    mov  EAX, aFindLen
    sub  ECX, EAX
    jl   @Result0
    mov  EAX, StartPos
    dec  EAX
    sub  ECX, EAX
    jl   @Result0
    inc  ECX

    // EDI := address of aSourceString[StartPos], ESI := address of
    // aFindString[1].
    mov  EDI, aSourceString
    add  EDI, StartPos
    dec  EDI
    mov  ESI, aFindString

    // AL := first byte of aFindString, used unchanged.
    mov  AL, [ESI]

  @ScaSB:
    // AH := current source byte, upper-cased if it is 'a'..'z'.
    mov  AH, [EDI]
    cmp  AH, $7A
    ja   @CompareChar
    cmp  AH, $61
    jb   @CompareChar
    and  AH, $DF

  @CompareChar:
    cmp  AH, AL
    jne  @NextChar

    // First byte matches: compare the rest, from the last byte backwards.
    // EBX = 0 means aFindLen = 1, which is already a complete match.
    mov  EBX, aFindLen
    dec  EBX
    jz   @EndOfMatch

  @CompareNext:
    mov  AL, [ESI+EBX]
    mov  AH, [EDI+EBX]
    // Upper-case the source byte only.
    cmp  AH, $7A
    ja   @CompareChar2
    cmp  AH, $61
    jb   @CompareChar2
    and  AH, $DF

  @CompareChar2:
    cmp  AL, AH
    jz   @Matches

    // Mismatch: AL was overwritten, so reload the first byte of aFindString
    // and resume scanning.
    mov  AL, [ESI]
    jmp  @NextChar

  @Matches:
    // Step to the previous byte; index 0 was compared in the scan loop.
    dec  EBX
    jnz  @CompareNext

  @EndOfMatch:
    // EDI still points at the start of the match. Convert the address to a
    // 1-based character position.
    mov  EAX, EDI
    sub  EAX, aSourceString
    inc  EAX
    mov  Result, EAX
    jmp  @TheEnd

  @NextChar:
    // Keep the raw current byte in AH to detect a two-byte character.
    mov  AH, [EDI]
    inc  EDI
    // One candidate position fewer; stop when none are left.
    dec  ECX
    jz   @Result0
    cmp  AH, $80
    jb   @ScaSB
    // Two-byte character: skip its second byte too.
    inc  EDI
    dec  ECX
    jnz  @ScaSB

  @Result0:
    mov  Result, 0

  @TheEnd:
    // Restore in the reverse order of the pushes.
    pop  EBX
    pop  EDI
    pop  ESI
  end;
end;

//add by shengquanhu end

// My move isn't as fast as MOVE when source and destination are both DWord
// aligned, but it's certainly faster when they're not. As we're moving
// characters in a string, it isn't very likely at all that both source and
// destination are DWord aligned, so moving bytes avoids the cycle penalty of
// reading/writing DWords across physical boundaries.

// Copies Count bytes from Source to Dest, one byte at a time, from the lowest
// address upwards. Does nothing when Count is zero or negative.
//
//   Source - first byte to read
//   Dest   - first byte to write
//   Count  - number of bytes to copy
//
// Because the copy runs forwards, overlapping regions where Dest lies inside
// the source range will re-read bytes that were already overwritten.
procedure MyMove(
  const Source; var Dest; Count : Integer);
asm
  // Note: When this function is called,
  // Delphi passes the parameters as follows:
  //   ECX = Count
  //   EAX = Const Source
  //   EDX = Var Dest

  // If there are no bytes to copy, just quit altogether; there's no point
  // pushing registers. A negative Count is treated the same way, otherwise
  // the DEC/JNZ loop below would wrap around through the whole 32-bit range.
  test ECX, ECX
  jle  @JustQuit

  // Preserve the critical Delphi registers.
  push ESI
  push EDI

  // Move Source into ESI (generally the SOURCE register) and Dest into EDI
  // (generally the DEST register for string commands). This might not
  // actually be necessary, as MOVSB etc. are not used; EAX and EDX might do
  // just as well.
  mov  ESI, EAX
  mov  EDI, EDX

  // The following loop does the same job as REP MOVSB.
@Loop:
  // Get the source byte.
  mov  AL, [ESI]
  // Point to next byte.
  inc  ESI
  // Put it into the Dest.
  mov  [EDI], AL
  // Point dest to next position.
  inc  EDI
  // Dec ECX to note how many we have left to copy.
  dec  ECX
  // If ECX <> 0, then loop.
  jnz  @Loop

  // Another optimization note. Many people like to do this:
  //   Mov AL, [ESI]
  //   Mov [EDI], Al
  //   Inc ESI
  // There's a hidden problem here. I won't go into too much detail, but the
  // Pentium can continue processing instructions while it's still working
  // out the result of INC ESI or INC EDI. If, however, you use them while
  // they're still being calculated, the processor will stop until they're
  // calculated (a penalty). Therefore, I alter ESI and EDI as far in advance
  // as possible of using them.

  // Pop the critical Delphi registers that we've altered.
  pop  EDI
  pop  ESI

@JustQuit:
end;

// Point 1: I pass VAR aSourceString rather than just aSourceString. This is
// because I'll just be passed a pointer to the data rather than a 10M copy of
// the data itself, which is much quicker!

// Returns a copy of aSourceString in which every occurrence of aFindString
// has been replaced by aReplaceString.
//
//   aSourceString  - text to search; passed by reference only to avoid a
//                    copy, this routine never assigns to it
//   aFindString    - text to look for; when it is empty, or longer than
//                    aSourceString, aSourceString is returned unchanged
//   aReplaceString - replacement text; an empty string deletes the matches
//   CaseSensitive  - False (default): 'a'..'z' match regardless of case
//
// Matches are found left to right and do not overlap: after a match the
// search resumes directly behind it.
function FastReplace(
  var aSourceString : String;
  const aFindString, aReplaceString : String;
  CaseSensitive : Boolean = False) : String;
var
  // Size already passed to SetLength, the REAL size of the work buffer.
  ActualResultLen,
  // Position of aFindString in aSourceString.
  CurrentPos,
  // Position just behind the last place aFindString was found at.
  LastPos,
  // Bytes to copy (that is, LastPos up to the current match).
  BytesToCopy,
  // The "running" result length, not the allocated one.
  ResultLen,
  // Lengths, cached to save calling LENGTH repetitively.
  FindLen,
  ReplaceLen,
  SourceLen : Integer;
  // Search routine, chosen once up front. Selecting between the
  // case-sensitive and case-insensitive search inside the loop would repeat
  // a test whose outcome never changes.
  FastPosProc : TFastPosProc;
  // Private copy of aFindString; upper-cased for case-insensitive searches.
  theFindString : String;
  // The result is assembled here and assigned to Result at the very end.
  // NOTE(review): for a call such as S := FastReplace(S, ...), Result and
  // aSourceString can refer to the same variable, in which case clearing or
  // resizing Result early would destroy the input - confirm against the
  // compiler in use.
  Buffer : String;
begin
  // Get the lengths of the strings.
  FindLen := Length(aFindString);
  ReplaceLen := Length(aReplaceString);
  SourceLen := Length(aSourceString);

  // Nothing can match an empty find string (and FindLen is used as a divisor
  // below), nor a find string longer than the source.
  if (FindLen = 0) or (SourceLen < FindLen) then
  begin
    Result := aSourceString;
    Exit;
  end;

  theFindString := aFindString;
  if CaseSensitive then
    FastPosProc := FastPos
  else
  begin
    // The case-insensitive search upper-cases source bytes only, so
    // upper-case the find string once here instead of on every comparison.
    FastPosProc := FastPosNoCaseNoUpcaseFindString;
    CurrentPos := 1;
    while CurrentPos <= FindLen do
    begin
      if theFindString[CurrentPos] >= #$80 then
        // First byte of a two-byte character: leave both bytes untouched.
        Inc(CurrentPos)
      else if (theFindString[CurrentPos] > #$60) and
              (theFindString[CurrentPos] < #$7B) then
        theFindString[CurrentPos] :=
          Char(Ord(theFindString[CurrentPos]) and $DF);
      Inc(CurrentPos);
    end;
  end;

  // If the replacement is no longer than the find string, the result can
  // never outgrow the source.
  if ReplaceLen <= FindLen then
    ActualResultLen := SourceLen
  else
    // Otherwise allow for the worst case: the source consists ONLY of
    // aFindString and every one of them grows by (ReplaceLen - FindLen).
    // Dividing before multiplying keeps the intermediate value from
    // overflowing on large inputs.
    ActualResultLen :=
      SourceLen + (SourceLen div FindLen) * (ReplaceLen - FindLen);

  // Allocate the work buffer once.
  SetLength(Buffer, ActualResultLen);
  CurrentPos := 1;
  ResultLen := 0;
  LastPos := 1;

  // Again, an IF statement inside the loop is eliminated by repeating code.
  // This approach results in very slightly larger code, but if ever you can
  // trade some memory in exchange for speed, go for it!
  if ReplaceLen > 0 then
  begin
    repeat
      // Get the position of the first (or next) aFindString in
      // aSourceString through the pre-selected search routine.
      CurrentPos :=
        FastPosProc(aSourceString, theFindString,
                    SourceLen, FindLen, CurrentPos);
      // If 0, then we're finished.
      if CurrentPos = 0 then
        Break;
      // Number of bytes to copy from the source string is
      // CurrentPos - LastPos, i.e. " cat " in "the cat the".
      BytesToCopy := CurrentPos - LastPos;
      // Copy chars from aSourceString to the end of the buffer. Skipped when
      // there is nothing to copy so that no element is indexed needlessly.
      if BytesToCopy > 0 then
        MyMove(aSourceString[LastPos],
               Buffer[ResultLen + 1], BytesToCopy);
      // Copy chars from aReplaceString to the end of the buffer.
      // Remember, using COPY would copy all of the data over and over again.
      MyMove(aReplaceString[1],
             Buffer[ResultLen + 1 + BytesToCopy], ReplaceLen);
      // Update the running length.
      ResultLen := ResultLen + BytesToCopy + ReplaceLen;
      // Continue searching directly behind this match.
      CurrentPos := CurrentPos + FindLen;
      LastPos := CurrentPos;
    until False;
  end
  else
  begin
    // ReplaceLen = 0 means we're deleting the substrings rather than
    // replacing them, so the extra MyMove from aReplaceString is not needed.
    repeat
      CurrentPos :=
        FastPosProc(aSourceString, theFindString,
                    SourceLen, FindLen, CurrentPos);
      if CurrentPos = 0 then
        Break;
      BytesToCopy := CurrentPos - LastPos;
      if BytesToCopy > 0 then
        MyMove(aSourceString[LastPos],
               Buffer[ResultLen + 1], BytesToCopy);
      ResultLen := ResultLen + BytesToCopy;
      CurrentPos := CurrentPos + FindLen;
      LastPos := CurrentPos;
    until False;
  end;

  // Now that all of the replaces are done, adjust the length of the final
  // result. LastPos becomes the index of the last source byte consumed.
  Dec(LastPos);
  // Set the length to the running length plus the bit of string left. That
  // is, " mat" when replacing "the" in "sat on the mat".
  SetLength(Buffer, ResultLen + (SourceLen - LastPos));
  // If there's a bit of string dangling, then add it to the end.
  if LastPos + 1 <= SourceLen then
    MyMove(aSourceString[LastPos + 1],
           Buffer[ResultLen + 1], SourceLen - LastPos);

  Result := Buffer;
end;

end.

posted @ 2010-07-01 11:25 Max Woods 阅读(1089) 评论(0) 编辑收藏举报

刷新页面返回顶部

Delphi高效的字符串处理

公告