C语言 检测一个文本文件的编码是否为utf-8

/*
    filename: isutf8.c
    Time:     2016-12-9 20:27
    Author:   Albert Wang
    email:    albertofwb@gmail.com
    Function: detect whether a text file's encoding is utf-8 format
*/

#include <stdio.h>
#include <stdlib.h>  // exit()
#include <io.h>  // _access() detect a file's existence

/* Boolean constants used together with the Bool type below. */
#define True  1
#define False 0

/* Truth value returned by IsUtf8(); holds either True or False. */
typedef char Bool;
/* Unsigned byte type, used when inspecting the bit pattern of file bytes. */
typedef unsigned char Uchar;

/*
    Read the first FileSize bytes of the file FileName into buf.

    FileName: path of the file to read (opened in binary mode).
    buf:      destination buffer; the caller must provide at least
              FileSize bytes. May be NULL only when FileSize is 0.
    FileSize: number of bytes to read.

    Returns 0 when exactly FileSize bytes were read, -1 when an argument
    is invalid, the file cannot be opened, or fewer than FileSize bytes
    could be read (buf contents are then unspecified).
*/
int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
{
    FILE   *fp;
    size_t  bytesRead = 0;

    if (FileName == NULL || (buf == NULL && FileSize > 0))
    {
        return -1;
    }

    if ((fp = fopen(FileName, "rb")) == NULL)
    {
        return -1;
    }

    // Skip fread for an empty request so a NULL buf is never passed to it.
    if (FileSize > 0)
    {
        bytesRead = fread(buf, 1, FileSize, fp);
    }
    fclose(fp);

    // A short read would leave part of buf uninitialized: report it.
    return (bytesRead == FileSize) ? 0 : -1;
}


/*
    Determine the size in bytes of the file FileName.

    FileName: path of the file to measure (opened in binary mode).
    FileSize: output; receives the size on success and is left
              untouched on failure.

    Returns 0 on success, -1 when an argument is NULL, the file cannot
    be opened, or its end position cannot be determined.
*/
int GetFileSize(const char *FileName, size_t *FileSize)
{
    FILE *fp;
    long  endPos = -1L;

    if (FileName == NULL || FileSize == NULL)
    {
        return -1;
    }

    if ((fp = fopen(FileName, "rb")) == NULL)
    {
        return -1;
    }

    // ftell() reports failure as -1L; only query it after a successful seek.
    if (fseek(fp, 0L, SEEK_END) == 0)
    {
        endPos = ftell(fp);
    }

    fclose(fp);

    // Never store a negative position into the unsigned size_t output.
    if (endPos < 0)
    {
        return -1;
    }

    *FileSize = (size_t)endPos;
    return 0;
}

Bool IsUtf8(const char* FileName)
{
    FILE *fp = NULL;
    size_t FileSize = 0;
    char *fileBuf = NULL;


    GetFileSize(FileName, &FileSize);
    fileBuf = (char *)malloc(FileSize);
    DumpFromFile(FileName, fileBuf, FileSize);

    size_t i = 0;
    Bool ret = True;

    for ( ; ret && (i < FileSize); i++)
    {
        Uchar hexchar = fileBuf[i];
        // ignore ascii code
        if (!(hexchar & 0x80))
        {
            continue;
        }

        // calculate how many serial "1"
        int   BitOneCount = 0;
        Uchar num = hexchar;
        while (num & 0x80)
        {
            if (num & 0x80)
            {
                BitOneCount += 1;
            }
            num <<= 1;
        }

        BitOneCount -= 1;
        while (BitOneCount > 0)
        {
            i += 1;
            num = fileBuf[i];   // num suppose to be 10xx xxxx
            num >>= 6;            // num = 0000 0010
            if (2 != num)
            {
                ret = False;
                //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d\n", i, num, hexchar, BitOneCount);
                break;
            }
            BitOneCount -= 1;
        }

    //end for
    }


    free(fileBuf);
    return ret;
}

/*
    Program entry point.

    Expects exactly one command-line argument, the path of the file to
    inspect. Prints a usage line and exits with status 1 when the
    argument count is wrong, prints a message and exits with status 1
    when the file cannot be accessed, and otherwise prints
    "[<path>] True" or "[<path>] False" according to IsUtf8().
*/
int main(int argc, char *argv[])
{
    // Indexed by the value IsUtf8() returns: 0 -> "False", 1 -> "True".
    char *verdictText[] = { "False", "True" };
    const char *path;

    if (2 != argc)
    {
        printf("Usage: %s <FileName>\n", argv[0]);
        exit(1);
    }

    path = argv[1];

    // Mode 0 asks _access() only whether the file exists.
    if (_access(path, 0) == -1)
    {
        printf("%s not exists!\n", path);
        exit(1);
    }

    printf("[%s] %s\n", path, verdictText[IsUtf8(path)]);

    return 0;
}

/*
    参考链接: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
*/

运行结果

 

使用 winhex 以utf8 的编码查看样本文件:

文件

 

posted @ 2016-12-09 20:50  SurfUniverse  阅读(2323)  评论(0)  编辑  收藏  举报