utf-8文件可以選擇是否加Byte Order Mark (BOM),先檢查文件前端有沒有BOM ($EF $BB $BF),有的話就是utf-8。
至於沒有BOM的文件,可經測試字符的編排模式去判斷是否utf-8。範例:
CODE
{ Heuristically decides whether the bytes held in s are laid out as UTF-8.

  s      : the text to inspect; each character of s is treated as one raw
           byte (NOTE(review): this presumes a byte-per-char string such as
           AnsiString content loaded from a file - confirm against callers).
  Result : True when every sequence that starts within the first 3000
           bytes is a well-formed 1-, 2-, 3- or 4-byte UTF-8 sequence
           (an empty or pure-ASCII string therefore yields True);
           False as soon as a malformed or truncated sequence is found.

  Only the lead-byte / continuation-byte bit patterns are checked; overlong
  encodings and surrogate code points are not rejected. }
function isUTF8(s: string): boolean;
const
  MAX_SCAN = 3000; // inspecting the first 3000 bytes is considered sufficient
var
  i, k, len, lead, trail: integer;
begin
  result := true;
  len := length(s);
  i := 1;
  // "while" (not "repeat") so that an empty string is never indexed and the
  // final byte of the string is validated as well.
  while (i <= len) and (i <= MAX_SCAN) do
  begin
    lead := Ord(s[i]);

    // Classify the lead byte: how many continuation bytes must follow it?
    if (lead and $80) = 0 then
      trail := 0                     // 0xxxxxxx: single-byte ASCII
    else if (lead and $E0) = $C0 then
      trail := 1                     // 110xxxxx: lead of a 2-byte sequence
    else if (lead and $F0) = $E0 then
      trail := 2                     // 1110xxxx: lead of a 3-byte sequence
    else if (lead and $F8) = $F0 then
      trail := 3                     // 11110xxx: lead of a 4-byte sequence
    else
    begin
      // Stray continuation byte or invalid lead byte: not UTF-8.
      result := false;
      Exit;
    end;

    // A sequence cut off by the end of the string is malformed; checking
    // this first also keeps the reads below inside the string.
    if i + trail > len then
    begin
      result := false;
      Exit;
    end;

    // Every continuation byte must match 10xxxxxx.
    for k := 1 to trail do
      if (Ord(s[i + k]) and $C0) <> $80 then
      begin
        result := false;
        Exit;
      end;

    i := i + trail + 1;
  end;
end;