The ZDetectUTF8Encoding of the ZEncoding unit has a bug
Posted: 23.12.2021, 12:46
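The ZDetectUTF8Encoding function of the ZEncoding unit has a bug: in the ASCII fast path of the main loop, SizeOf(PCardinal) should be changed to SizeOf(Cardinal). The code tests a single Cardinal (4 bytes) for non-ASCII bits, but SizeOf(PCardinal) is the size of a pointer, which is 8 on 64-bit targets, so the loop then skips 8 bytes and 4 of them are never validated.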
{ Classifies a byte buffer as pure US-ASCII, well-formed UTF-8, or neither.

  Source: pointer to the first byte of the buffer (may be nil).
  Len:    number of bytes in the buffer.

  Returns etUSASCII when the buffer is nil/empty or every byte is < $80,
  etUTF8 when the remainder after the leading ASCII run consists only of
  well-formed UTF-8 sequences, and etANSI otherwise. }
function ZDetectUTF8Encoding(Source: PAnsiChar; Len: NativeUInt): TEncodeType;
var
  c: Byte;
  EndPtr: PAnsiChar;
begin
  Result := etUSASCII;
  if (Source = nil) or (Len = 0) then Exit;
  // Last position from which a whole 4-byte quad can still be read.
  EndPtr := Source + Len - SizeOf(Cardinal);
  // Skip the leading US-ASCII part four bytes at a time.
  while Source <= EndPtr do
  begin
    // Any high bit set in the quad means a non-ASCII byte is present.
    if PCardinal(Source)^ and $80808080 <> 0 then Break;
    Inc(Source, SizeOf(Cardinal));
  end;
  // From here on EndPtr points one past the last byte of the buffer.
  Inc(EndPtr, SizeOf(Cardinal));
  // Finish the leading ASCII run byte by byte.
  while Source < EndPtr do
  begin
    if Byte(Source^) >= $80 then Break;
    Inc(Source);
  end;
  // Every byte was US-ASCII: done.
  if Source = EndPtr then Exit;
  // Validate the remainder as UTF-8, one sequence per iteration.
  while Source < EndPtr do
  begin
    c := Byte(Source^);
    case c of
      $00..$7F: // ASCII7
        // Fast path: the test reads exactly one Cardinal, so the bound and
        // the step must both be SizeOf(Cardinal). Using SizeOf(PCardinal)
        // (8 on 64-bit targets) skipped four bytes that were never checked.
        if (EndPtr - Source >= SizeOf(Cardinal))
          and (PCardinal(Source)^ and $80808080 = 0) then
          Inc(Source, SizeOf(Cardinal))
        else
          Inc(Source);
      $C2..$DF: // non-overlong 2-byte
        if (Source + 1 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$BF]) then
          Inc(Source, 2)
        else
          Break;
      $E0: // 3-byte, excluding overlongs
        if (Source + 2 < EndPtr)
          and (Byte((Source + 1)^) in [$A0..$BF])
          and (Byte((Source + 2)^) in [$80..$BF]) then
          Inc(Source, 3)
        else
          Break;
      $E1..$EC, $EE..$EF: // straight 3-byte
        if (Source + 2 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$BF])
          and (Byte((Source + 2)^) in [$80..$BF]) then
          Inc(Source, 3)
        else
          Break;
      $ED: // 3-byte, excluding surrogates U+D800..U+DFFF ($ED $A0..$BF xx)
        if (Source + 2 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$9F])
          and (Byte((Source + 2)^) in [$80..$BF]) then
          Inc(Source, 3)
        else
          Break;
      $F0: // planes 1-3
        if (Source + 3 < EndPtr)
          and (Byte((Source + 1)^) in [$90..$BF])
          and (Byte((Source + 2)^) in [$80..$BF])
          and (Byte((Source + 3)^) in [$80..$BF]) then
          Inc(Source, 4)
        else
          Break;
      $F1..$F3: // planes 4-15
        if (Source + 3 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$BF])
          and (Byte((Source + 2)^) in [$80..$BF])
          and (Byte((Source + 3)^) in [$80..$BF]) then
          Inc(Source, 4)
        else
          Break;
      $F4: // plane 16, capped at U+10FFFF
        if (Source + 3 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$8F])
          and (Byte((Source + 2)^) in [$80..$BF])
          and (Byte((Source + 3)^) in [$80..$BF]) then
          Inc(Source, 4)
        else
          Break;
    else
      // Stray continuation byte, overlong lead ($C0/$C1) or lead > $F4.
      Break;
    end;
  end;
  // Reaching the end means every sequence validated; an early Break did not.
  if Source = EndPtr then
    Result := etUTF8
  else
    Result := etANSI;
end;
{ Classifies a byte buffer as pure US-ASCII, well-formed UTF-8, or neither.

  Source: pointer to the first byte of the buffer (may be nil).
  Len:    number of bytes in the buffer.

  Returns etUSASCII when the buffer is nil/empty or every byte is < $80,
  etUTF8 when the remainder after the leading ASCII run consists only of
  well-formed UTF-8 sequences, and etANSI otherwise. }
function ZDetectUTF8Encoding(Source: PAnsiChar; Len: NativeUInt): TEncodeType;
var
  c: Byte;
  EndPtr: PAnsiChar;
begin
  Result := etUSASCII;
  if (Source = nil) or (Len = 0) then Exit;
  // Last position from which a whole 4-byte quad can still be read.
  EndPtr := Source + Len - SizeOf(Cardinal);
  // Skip the leading US-ASCII part four bytes at a time.
  while Source <= EndPtr do
  begin
    // Any high bit set in the quad means a non-ASCII byte is present.
    if PCardinal(Source)^ and $80808080 <> 0 then Break;
    Inc(Source, SizeOf(Cardinal));
  end;
  // From here on EndPtr points one past the last byte of the buffer.
  Inc(EndPtr, SizeOf(Cardinal));
  // Finish the leading ASCII run byte by byte.
  while Source < EndPtr do
  begin
    if Byte(Source^) >= $80 then Break;
    Inc(Source);
  end;
  // Every byte was US-ASCII: done.
  if Source = EndPtr then Exit;
  // Validate the remainder as UTF-8, one sequence per iteration.
  while Source < EndPtr do
  begin
    c := Byte(Source^);
    case c of
      $00..$7F: // ASCII7
        // Fast path: the test reads exactly one Cardinal, so the bound and
        // the step must both be SizeOf(Cardinal). Using SizeOf(PCardinal)
        // (8 on 64-bit targets) skipped four bytes that were never checked.
        if (EndPtr - Source >= SizeOf(Cardinal))
          and (PCardinal(Source)^ and $80808080 = 0) then
          Inc(Source, SizeOf(Cardinal))
        else
          Inc(Source);
      $C2..$DF: // non-overlong 2-byte
        if (Source + 1 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$BF]) then
          Inc(Source, 2)
        else
          Break;
      $E0: // 3-byte, excluding overlongs
        if (Source + 2 < EndPtr)
          and (Byte((Source + 1)^) in [$A0..$BF])
          and (Byte((Source + 2)^) in [$80..$BF]) then
          Inc(Source, 3)
        else
          Break;
      $E1..$EC, $EE..$EF: // straight 3-byte
        if (Source + 2 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$BF])
          and (Byte((Source + 2)^) in [$80..$BF]) then
          Inc(Source, 3)
        else
          Break;
      $ED: // 3-byte, excluding surrogates U+D800..U+DFFF ($ED $A0..$BF xx)
        if (Source + 2 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$9F])
          and (Byte((Source + 2)^) in [$80..$BF]) then
          Inc(Source, 3)
        else
          Break;
      $F0: // planes 1-3
        if (Source + 3 < EndPtr)
          and (Byte((Source + 1)^) in [$90..$BF])
          and (Byte((Source + 2)^) in [$80..$BF])
          and (Byte((Source + 3)^) in [$80..$BF]) then
          Inc(Source, 4)
        else
          Break;
      $F1..$F3: // planes 4-15
        if (Source + 3 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$BF])
          and (Byte((Source + 2)^) in [$80..$BF])
          and (Byte((Source + 3)^) in [$80..$BF]) then
          Inc(Source, 4)
        else
          Break;
      $F4: // plane 16, capped at U+10FFFF
        if (Source + 3 < EndPtr)
          and (Byte((Source + 1)^) in [$80..$8F])
          and (Byte((Source + 2)^) in [$80..$BF])
          and (Byte((Source + 3)^) in [$80..$BF]) then
          Inc(Source, 4)
        else
          Break;
    else
      // Stray continuation byte, overlong lead ($C0/$C1) or lead > $F4.
      Break;
    end;
  end;
  // Reaching the end means every sequence validated; an early Break did not.
  if Source = EndPtr then
    Result := etUTF8
  else
    Result := etANSI;
end;