On character encoding of text files

Keywords: encoding ascii

On Windows 10, a TXT file can be saved in one of several character encodings: ASCII, ANSI, UTF-8, UTF-8 with BOM, UTF-16LE and UTF-16BE.
In general, UTF-8 is used by default. This encoding is universal and can express any language, but it has the disadvantage of being variable-length: different characters occupy different numbers of bytes. Here are the characteristics of these encodings:
(1) ASCII and ANSI encoding: ANSI is a superset of ASCII. If the highest bit of a byte is 0 (00-7F, binary form 0XXX XXXX), the byte is an ASCII character from the US-ASCII character table. If the highest bit of a byte is 1 (80-FE), the byte belongs to the ANSI code page (GBK on a Chinese system). Note that in this case two bytes represent one Chinese character: two consecutive bytes that both have the highest bit set, in binary form 1XXX XXXX 1XXX XXXX, represent one Chinese character. (Strictly, this holds for the GB2312 subset; in full GBK the second byte may also lie in the range 40-7E.) Because bytes 00-7F keep their ASCII meaning, ANSI encoding is compatible with ASCII encoding.
Under ANSI encoding, if the text consists only of Western (ASCII) characters, it can equally be regarded as ASCII. When a run of consecutive bytes with the high bit set appears, the text can be judged to be ANSI (GBK). GBK expresses a Chinese character as two bytes whose high bits are both 1. A single-byte ANSI code page expresses each character as one byte, whose high bit may be either 1 or 0.
(2) UTF-8 encoding. UTF-8 is a variable-length, multibyte encoding: each Unicode character is represented by one or more bytes. When the text consists only of ASCII characters, the UTF-8 bytes are identical to the ASCII bytes (UTF-8 is backward compatible with ASCII). The original design allowed up to six bytes per character, although the current standard (RFC 3629) limits a character to four bytes. The byte patterns are as follows:
1 byte: 0xxxxxxx
2 bytes: 110xxxxx 10xxxxxx
3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (most Chinese characters are encoded in this 3-byte form)
4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5 bytes: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (obsolete form)
6 bytes: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (obsolete form)
Note that in UTF-8 text, characters of different lengths are mixed: a single word or sentence may contain characters that are 1, 2, 3 or more bytes long. This makes UTF-8 harder to recognize than the other encodings.
(3) UTF-8 with BOM: the text file begins with three leading flag bytes (the byte order mark), 0xEF, 0xBB, 0xBF. If these bytes are present, the text file is UTF-8 encoded.
(4) UTF-16LE: the byte stream is little-endian. This is the encoding Windows commonly calls "Unicode". Most characters occupy one 2-byte code unit; characters outside the Basic Multilingual Plane occupy two code units (4 bytes). The text file begins with the leading flag bytes (BOM) 0xFF 0xFE. If these bytes are present, the text file is UTF-16LE encoded.
(5) UTF-16BE: the byte stream is big-endian. It is the same Unicode encoding as UTF-16LE with the two bytes of each code unit swapped. The text file begins with the leading flag bytes (BOM) 0xFE 0xFF. If these bytes are present, the text file is UTF-16BE encoded.

From the above analysis, ASCII, ANSI and UTF-8 (without BOM) text files carry no marker, so the byte stream of the text itself must be analyzed to determine the encoding. This method is complex, especially for UTF-8.
For the other three encodings (UTF-8 with BOM, UTF-16LE and UTF-16BE), only the header bytes of the text file need to be examined to determine the encoding. This method is simple.
Here is my analysis code:
 

type
  // Text-file encodings that GetTextFormat can report.
  // The order matters: TextFormatFlag is indexed by the sub-range
  // tfUTF_16LE..tfUtf_8BOM and TextFormatFlagT by the whole enumeration.
  TTextFormat=(tfUSAASCII,tfANSI,tfUtf_8,tfUTF_16LE,tfUTF_16BE,tfUtf_8BOM);
    // Returns the detected encoding of the text file FileName.
    // sText receives a short description of how the encoding was recognized
    // (the leading flag bytes, or a note that no flag was found).
    // NOTE(review): this and the next declaration are implemented below as
    // TForm1 methods, so they presumably sit inside the TForm1 class
    // declaration, which is not shown in this excerpt.
   function GetTextFormat(const FileName: string;  var sText:string):TTextFormat;

    // Classifies the first Bn bytes of ss as US-ASCII, ANSI or UTF-8.
    // On return Bn holds the byte length of the widest character form found.
    // NOTE(review): TEncodeType is not declared in this excerpt; presumably
    // it is WideStrUtils.TEncodeType - confirm against the uses clause.
   function DetectUTF8Encoding2(const ss: array of byte; var Bn:Integer): TEncodeType;

const
  // Leading flag bytes of each BOM-marked format, written as the number
  // obtained by reading the bytes in file order (first byte most significant).
  TextFormatFlag:array[tfUTF_16LE..tfUtf_8BOM] of LongWord=($FFFE,$FEFF,$EFBBBF);

  // Display name for every TTextFormat value.
  TextFormatFlagT:array[TTextFormat] of string=('ASCII','ANSI(GBK)','UTF-8','UTF-16LE','UTF-16BE','UTF-8BOM');


implementation

{$R *.dfm}

{ TForm1 }

function TForm1.GetTextFormat(const FileName: string;   var sText: string): TTextFormat;
{ Determines the character encoding of the text file FileName.

  The first 128 bytes of the file are read. If they start with a known
  leading flag (byte order mark) the format is decided from the flag alone:
    FF FE     -> tfUTF_16LE
    FE FF     -> tfUTF_16BE
    EF BB BF  -> tfUtf_8BOM
  Otherwise the bytes that were read are handed to DetectUTF8Encoding2, which
  classifies them as UTF-8, ANSI or US-ASCII.

  Parameters:
    FileName - path of the file to inspect; opened read-only, share-deny-none.
    sText    - receives a description of how the result was obtained: the
               leading flag in hexadecimal, or a note that no flag was found
               together with the character byte length reported by the probe.

  Returns the detected TTextFormat. tfANSI is the fallback when the probe
  returns a value other than etUTF8, etANSI or etUSASCII.

  Errors raised by TFileStream.Create (for example when the file cannot be
  opened) propagate to the caller.

  NOTE: only the first 128 bytes are examined, so a file whose first
  non-ASCII character appears later is reported as ASCII. }
var
  fTxtStream: TFileStream;          // file being inspected
  context: array[0..127] of Byte;   // first bytes of the file
  Count, i: Integer;
  w: Word;
  NeedProbe: Boolean;               // True while no leading flag has been recognized
begin
  sText := '';
  Result := tfANSI;
  NeedProbe := True;

  fTxtStream := TFileStream.Create(FileName, fmOpenRead or fmShareDenyNone);
  try
    // Read returns the number of bytes actually read (0 for an empty file).
    Count := fTxtStream.Read(context, SizeOf(context));

    // A leading flag is at least two bytes long.
    if Count >= 2 then
    begin
      w := (Word(context[0]) shl 8) or context[1];   // first two bytes, file order
      case w of
        $FFFE:
          begin
            Result := tfUTF_16LE;
            NeedProbe := False;
          end;
        $FEFF:
          begin
            Result := tfUTF_16BE;
            NeedProbe := False;
          end;
        $EFBB:
          // The UTF-8 flag needs a third byte. A file that stops after EF BB,
          // or continues with another byte, has no flag and must be probed.
          if Count >= 3 then
            if context[2] = $BF then
            begin
              Result := tfUtf_8BOM;
              NeedProbe := False;
            end;
      end;
    end;

    if not NeedProbe then
    begin
      // Report the recognized flag: 6 hex digits for UTF-8, 4 for UTF-16.
      if Result = tfUtf_8BOM then
        sText := 'Leading glyph-' + RightStr(IntToHex(TextFormatFlag[Result]), 6)
      else
        sText := 'Leading glyph-' + RightStr(IntToHex(TextFormatFlag[Result]), 4);
    end
    else
    begin
      // No flag: classify the content itself. i carries the number of valid
      // bytes in and the detected character byte length out.
      i := Count;
      case DetectUTF8Encoding2(context, i) of
        etUTF8:    Result := tfUtf_8;
        etANSI:    Result := tfANSI;
        etUSASCII: Result := tfUSAASCII;
      end;

      sText := 'No leading flag word, encoded byte length-' + IntToStr(i);
    end;
  finally
    FreeAndNil(fTxtStream);
  end;
end;



function TForm1.DetectUTF8Encoding2(const ss: array of byte; var Bn:Integer): TEncodeType;
{ Classifies a byte buffer as US-ASCII, ANSI or UTF-8.

  Parameters:
    ss - buffer holding the bytes to examine.
    Bn - on entry, the number of valid bytes in ss; on return, the byte
         length of the widest character form that was recognized
         (UTF-8: 2..4, ANSI: 1 or 2, US-ASCII: 1, nothing examined: 0).

  Returns
    etUTF8    if at least one well-formed 2-, 3- or 4-byte UTF-8 sequence
              was found;
    etANSI    otherwise, if any byte in $80..$FD was found;
    etUSASCII otherwise (also for an empty buffer, or when Bn on entry is
              larger than Length(ss), in which case nothing is examined).

  Every byte position is tested independently; the scan does not skip over
  the trail bytes of a sequence it has already recognized. }
var
  i, sCount: Integer;
  sT: array[TEncodeType] of Integer;   // widest character length seen per class
begin
  Result := etUSASCII;

  sT[etUSASCII] := 0;
  sT[etANSI] := 0;
  sT[etUTF8] := 0;

  sCount := Bn;   // number of valid bytes supplied by the caller
  Bn := 0;

  if sCount <= Length(ss) then
    for i := 0 to sCount - 1 do
    begin
      // Plain 7-bit character.
      if ss[i] in [$00..$7F] then
        sT[etUSASCII] := 1;

      // High byte: one half of a double-byte (GBK) character when the next
      // byte is high as well, otherwise a single-byte ANSI character.
      // A high byte in the last position has no successor to inspect and is
      // counted as single-byte, so such text is not reported as ASCII.
      if ss[i] in [$80..$FD] then
      begin
        if (i + 1) < sCount then
        begin
          if ss[i + 1] in [$80..$FD] then
            sT[etANSI] := 2
          else if sT[etANSI] = 0 then
            sT[etANSI] := 1;
        end
        else if sT[etANSI] = 0 then
          sT[etANSI] := 1;
      end;

      // 110xxxxx 10xxxxxx
      if (ss[i] in [$C0..$DF]) and ((i + 1) < sCount) then
        if ss[i + 1] in [$80..$BF] then
          if sT[etUTF8] < 2 then
            sT[etUTF8] := 2;

      // 1110xxxx 10xxxxxx 10xxxxxx
      // Each trail byte is tested on its own; AND-ing the byte values
      // together would also accept bytes that are not of the form 10xxxxxx.
      if (ss[i] in [$E0..$EF]) and ((i + 2) < sCount) then
        if ss[i + 1] in [$80..$BF] then
          if ss[i + 2] in [$80..$BF] then
            if sT[etUTF8] < 3 then
              sT[etUTF8] := 3;

      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (ss[i] in [$F0..$F7]) and ((i + 3) < sCount) then
        if ss[i + 1] in [$80..$BF] then
          if ss[i + 2] in [$80..$BF] then
            if ss[i + 3] in [$80..$BF] then
              if sT[etUTF8] < 4 then
                sT[etUTF8] := 4;
    end;

  // UTF-8 evidence wins over ANSI, which wins over plain ASCII.
  if sT[etUTF8] >= 2 then
  begin
    Result := etUTF8;
    Bn := sT[etUTF8];
  end
  else if sT[etANSI] >= 1 then
  begin
    Result := etANSI;
    Bn := sT[etANSI];
  end
  else if sT[etUSASCII] >= 1 then
  begin
    Result := etUSASCII;
    Bn := sT[etUSASCII];
  end;
end;

 

Published 14 original articles, won praise 11, visited 20000+
Private letter follow

Posted by devarishi on Mon, 16 Mar 2020 05:46:28 -0700