Linux 平台和 Windows 平台下 Unicode 与 UTF-8 互转


/*
 * Read one Unicode code point from the wide string `s` at index `*pos` and
 * advance `*pos` past it.
 *
 * A UTF-16 high/low surrogate pair is combined into a single code point.
 * An unpaired surrogate or a value above U+10FFFF yields U+FFFD (the
 * replacement character).  The NUL terminator is never stepped over: the
 * element after a high surrogate is only consumed when it is a low surrogate.
 */
static unsigned long utf8_read_wide(const wchar_t *s, size_t *pos)
{
    unsigned long c = (unsigned long)s[(*pos)++];

    if (c >= 0xD800 && c <= 0xDBFF)
    {
        unsigned long low = (unsigned long)s[*pos];
        if (low >= 0xDC00 && low <= 0xDFFF)
        {
            ++*pos;
            return 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
        }
        return 0xFFFD;
    }

    if ((c >= 0xDC00 && c <= 0xDFFF) || c > 0x10FFFF)
        return 0xFFFD;

    return c;
}

/* Number of UTF-8 bytes needed to encode code point `c` (1 to 4). */
static size_t utf8_encoded_length(unsigned long c)
{
    if (c < 0x80)
        return 1;
    if (c < 0x800)
        return 2;
    if (c < 0x10000)
        return 3;
    return 4;
}

/*
 * Convert a NUL-terminated wide-character string to UTF-8.
 *
 * Works with both 16-bit wchar_t holding UTF-16 and 32-bit wchar_t holding
 * full code points: surrogate pairs are merged and code points above U+FFFF
 * are written as 4-byte sequences.  Invalid input elements are replaced by
 * U+FFFD.
 *
 * unicode: NUL-terminated wide string; may be NULL.
 * returns: a NUL-terminated buffer obtained from malloc() that the caller
 *          must release with free(), or NULL when `unicode` is NULL or the
 *          allocation fails.
 */
unsigned char * make_utf8_string(const wchar_t *unicode)
{
    size_t size = 0, index = 0, out_index = 0;
    unsigned char *out;
    unsigned long c;

    if (unicode == NULL)
        return NULL;

    /* first calculate the size of the target string */
    while (unicode[index] != 0)
        size += utf8_encoded_length(utf8_read_wide(unicode, &index));

    out = (unsigned char*)malloc(size + 1);
    if (out == NULL)
        return NULL;

    /* second pass: emit the bytes; it consumes input exactly like the first */
    index = 0;
    while (unicode[index] != 0)
    {
        c = utf8_read_wide(unicode, &index);

        if (c < 0x80)
        {
            out[out_index++] = (unsigned char)c;
        }
        else if (c < 0x800)
        {
            out[out_index++] = (unsigned char)(0xc0 | (c >> 6));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
        else if (c < 0x10000)
        {
            out[out_index++] = (unsigned char)(0xe0 | (c >> 12));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
        else
        {
            out[out_index++] = (unsigned char)(0xf0 | (c >> 18));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 12) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
    }

    out[out_index] = 0x00;

    return out;
}

/*
 * Decode one code point from the NUL-terminated UTF-8 string `utf8`,
 * starting at index `*pos`, and advance `*pos` past the bytes consumed.
 *
 * Returns U+FFFD (the replacement character) for an invalid lead byte, a
 * missing or malformed continuation byte, an overlong encoding, an encoded
 * surrogate, or a value above U+10FFFF.  A byte that is not a continuation
 * byte (which includes the NUL terminator) is never consumed as part of a
 * sequence, so the scan cannot run past the end of the string.
 */
static unsigned long utf8_decode_one(const unsigned char *utf8, size_t *pos)
{
    unsigned char lead = utf8[(*pos)++];
    unsigned long c, min;
    int extra, i;

    if (lead < 0x80)
        return lead;

    if ((lead & 0xe0) == 0xc0)
    {
        c = lead & 0x1f; extra = 1; min = 0x80;
    }
    else if ((lead & 0xf0) == 0xe0)
    {
        c = lead & 0x0f; extra = 2; min = 0x800;
    }
    else if ((lead & 0xf8) == 0xf0)
    {
        c = lead & 0x07; extra = 3; min = 0x10000;
    }
    else
    {
        return 0xFFFD;   /* stray continuation byte or invalid lead byte */
    }

    for (i = 0; i < extra; ++i)
    {
        unsigned char b = utf8[*pos];
        if ((b & 0xc0) != 0x80)
            return 0xFFFD;   /* truncated sequence; leave `b` for the caller */
        c = (c << 6) | (b & 0x3f);
        ++*pos;
    }

    if (c < min || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
        return 0xFFFD;

    return c;
}

/*
 * Convert a NUL-terminated UTF-8 string to a wide-character string.
 *
 * Sequences of 1 to 4 bytes are decoded.  When wchar_t is narrower than
 * 32 bits, code points above U+FFFF are stored as a UTF-16 surrogate pair;
 * otherwise each code point occupies one wchar_t.  Malformed input is
 * replaced by U+FFFD instead of being read past.
 *
 * utf8:    NUL-terminated UTF-8 bytes; may be NULL.
 * returns: a NUL-terminated buffer obtained from malloc() that the caller
 *          must release with free(), or NULL when `utf8` is NULL or the
 *          allocation fails.
 */
wchar_t * make_unicode_string(const unsigned char *utf8)
{
    size_t size = 0, index = 0, out_index = 0;
    wchar_t *out;
    unsigned long c;
    const int needs_surrogates = (sizeof(wchar_t) < 4);

    if (utf8 == NULL)
        return NULL;

    /* first calculate the size of the target string */
    while (utf8[index] != 0)
    {
        c = utf8_decode_one(utf8, &index);
        size += (needs_surrogates && c > 0xFFFF) ? 2 : 1;
    }

    out = (wchar_t*)malloc((size + 1) * sizeof(wchar_t));
    if (out == NULL)
        return NULL;

    /* second pass: store the decoded values */
    index = 0;
    while (utf8[index] != 0)
    {
        c = utf8_decode_one(utf8, &index);

        if (needs_surrogates && c > 0xFFFF)
        {
            c -= 0x10000;
            out[out_index++] = (wchar_t)(0xD800 + (c >> 10));
            out[out_index++] = (wchar_t)(0xDC00 + (c & 0x3ff));
        }
        else
        {
            out[out_index++] = (wchar_t)c;
        }
    }

    out[out_index] = 0;

    return out;
}

int StrUtil::utf8_encode(const char *from, char **to)
    wchar_t *unicode;
    int wchars, err;

    wchars = ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
        strlen(from), NULL, 0);

    if (wchars == 0)
        fprintf(stderr, "Unicode translation error %d
", GetLastError());
        return -1;

    unicode = (wchar_t*)calloc(wchars + 1, sizeof(unsigned short));
    if(unicode == NULL)
        fprintf(stderr, "Out of memory processing string to UTF8
        return -1;

    err = ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
        strlen(from), unicode, wchars);
    if(err != wchars)
        fprintf(stderr, "Unicode encode error %d
", GetLastError());
        return -1;

    /* On NT-based windows systems, we could use WideCharToMultiByte(), but
    * MS doesn't actually have a consistent API across win32.
    *to = (char *)make_utf8_string(unicode);

    return 0;

int StrUtil::utf8_decode(const char *from, char **to)
    wchar_t *unicode;
    int chars, err;

    /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
    * MS doesn't actually have a consistent API across win32.
    unicode = make_unicode_string((unsigned char*)from);
    if(unicode == NULL)
        fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16
        return -1;

    chars = ::WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
        -1, NULL, 0, NULL, NULL);

    if(chars == 0)
        fprintf(stderr, "Unicode translation error %d
", GetLastError());
        return -1;

    *to = (char *)calloc(chars + 1, sizeof(unsigned char));
    if(*to == NULL)
        fprintf(stderr, "Out of memory processing string to local charset
        return -1;

    err = ::WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
        -1, *to, chars, NULL, NULL);
    if(err != chars)
        fprintf(stderr, "Unicode decode error %d
", GetLastError());
        *to = NULL;
        return -1;

    return 0;

Linux 平台:

/*
 * Read one Unicode code point from the wide string `s` at index `*pos` and
 * advance `*pos` past it.
 *
 * A UTF-16 high/low surrogate pair is combined into a single code point.
 * An unpaired surrogate, a negative element, or a value above U+10FFFF
 * yields U+FFFD (the replacement character).  The NUL terminator is never
 * stepped over: the element after a high surrogate is only consumed when it
 * is a low surrogate.
 */
static unsigned long utf8_read_wide(const wchar_t *s, size_t *pos)
{
    /* A negative wchar_t converts to a huge value and is rejected below. */
    unsigned long c = (unsigned long)s[(*pos)++];

    if (c >= 0xD800 && c <= 0xDBFF)
    {
        unsigned long low = (unsigned long)s[*pos];
        if (low >= 0xDC00 && low <= 0xDFFF)
        {
            ++*pos;
            return 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
        }
        return 0xFFFD;
    }

    if ((c >= 0xDC00 && c <= 0xDFFF) || c > 0x10FFFF)
        return 0xFFFD;

    return c;
}

/* Number of UTF-8 bytes needed to encode code point `c` (1 to 4). */
static size_t utf8_encoded_length(unsigned long c)
{
    if (c < 0x80)
        return 1;
    if (c < 0x800)
        return 2;
    if (c < 0x10000)
        return 3;
    return 4;
}

/*
 * Convert a NUL-terminated wide-character string to UTF-8.
 *
 * Each element is read at full wchar_t width (not truncated to 16 bits), so
 * code points above U+FFFF are written as 4-byte sequences and an element
 * whose low 16 bits are zero no longer ends the conversion early.  Surrogate
 * pairs are merged; invalid elements are replaced by U+FFFD.
 *
 * unicode: NUL-terminated wide string; may be NULL.
 * returns: a NUL-terminated buffer obtained from malloc() that the caller
 *          must release with free(), or NULL when `unicode` is NULL or the
 *          allocation fails.
 */
unsigned char * make_utf8_string(const wchar_t *unicode)
{
    size_t size = 0, index = 0, out_index = 0;
    unsigned char *out;
    unsigned long c;

    if (unicode == NULL)
        return NULL;

    /* first calculate the size of the target string */
    while (unicode[index] != 0)
        size += utf8_encoded_length(utf8_read_wide(unicode, &index));

    out = (unsigned char*)malloc(size + 1);
    if (out == NULL)
        return NULL;

    /* second pass: emit the bytes; it consumes input exactly like the first */
    index = 0;
    while (unicode[index] != 0)
    {
        c = utf8_read_wide(unicode, &index);

        if (c < 0x80)
        {
            out[out_index++] = (unsigned char)c;
        }
        else if (c < 0x800)
        {
            out[out_index++] = (unsigned char)(0xc0 | (c >> 6));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
        else if (c < 0x10000)
        {
            out[out_index++] = (unsigned char)(0xe0 | (c >> 12));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
        else
        {
            out[out_index++] = (unsigned char)(0xf0 | (c >> 18));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 12) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
    }

    out[out_index] = 0x00;

    return out;
}

/*
 * Decode one code point from the NUL-terminated UTF-8 string `utf8`,
 * starting at index `*pos`, and advance `*pos` past the bytes consumed.
 *
 * Returns U+FFFD (the replacement character) for an invalid lead byte, a
 * missing or malformed continuation byte, an overlong encoding, an encoded
 * surrogate, or a value above U+10FFFF.  A byte that is not a continuation
 * byte (which includes the NUL terminator) is never consumed as part of a
 * sequence, so the scan cannot run past the end of the string.
 */
static unsigned long utf8_decode_one(const unsigned char *utf8, size_t *pos)
{
    unsigned char lead = utf8[(*pos)++];
    unsigned long c, min;
    int extra, i;

    if (lead < 0x80)
        return lead;

    if ((lead & 0xe0) == 0xc0)
    {
        c = lead & 0x1f; extra = 1; min = 0x80;
    }
    else if ((lead & 0xf0) == 0xe0)
    {
        c = lead & 0x0f; extra = 2; min = 0x800;
    }
    else if ((lead & 0xf8) == 0xf0)
    {
        c = lead & 0x07; extra = 3; min = 0x10000;
    }
    else
    {
        return 0xFFFD;   /* stray continuation byte or invalid lead byte */
    }

    for (i = 0; i < extra; ++i)
    {
        unsigned char b = utf8[*pos];
        if ((b & 0xc0) != 0x80)
            return 0xFFFD;   /* truncated sequence; leave `b` for the caller */
        c = (c << 6) | (b & 0x3f);
        ++*pos;
    }

    if (c < min || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
        return 0xFFFD;

    return c;
}

/*
 * Convert a NUL-terminated UTF-8 string to a wide-character string.
 *
 * Sequences of 1 to 4 bytes are decoded.  When wchar_t is at least 32 bits
 * wide each code point occupies one element; with a narrower wchar_t, code
 * points above U+FFFF are stored as a UTF-16 surrogate pair.  Malformed
 * input is replaced by U+FFFD instead of being read past.
 *
 * utf8:    NUL-terminated UTF-8 bytes; may be NULL.
 * returns: a NUL-terminated buffer obtained from malloc() that the caller
 *          must release with free(), or NULL when `utf8` is NULL or the
 *          allocation fails.
 */
wchar_t * make_unicode_string(const unsigned char *utf8)
{
    size_t size = 0, index = 0, out_index = 0;
    wchar_t *out;
    unsigned long c;
    const int needs_surrogates = (sizeof(wchar_t) < 4);

    if (utf8 == NULL)
        return NULL;

    /* first calculate the size of the target string */
    while (utf8[index] != 0)
    {
        c = utf8_decode_one(utf8, &index);
        size += (needs_surrogates && c > 0xFFFF) ? 2 : 1;
    }

    out = (wchar_t*)malloc((size + 1) * sizeof(wchar_t));
    if (out == NULL)
        return NULL;

    /* second pass: store the decoded values */
    index = 0;
    while (utf8[index] != 0)
    {
        c = utf8_decode_one(utf8, &index);

        if (needs_surrogates && c > 0xFFFF)
        {
            c -= 0x10000;
            out[out_index++] = (wchar_t)(0xD800 + (c >> 10));
            out[out_index++] = (wchar_t)(0xDC00 + (c & 0x3ff));
        }
        else
        {
            out[out_index++] = (wchar_t)c;
        }
    }

    out[out_index] = 0;

    return out;
}
int utf8_encode(const char *from, char **to)
    wchar_t *unicode = NULL;
    int wchars, err;

    wchars = mbstowcs(unicode, from, 0)+1;

    unicode = new wchar_t[wchars];

    err = mbstowcs(unicode, from, wchars);
    if(err < 0)
        delete unicode;
        fprintf(stderr, "Unicode encode error 
        return -1;


    *to = (char *)make_utf8_string(unicode);

    delete unicode;

    return 0;

int utf8_decode(const char *from, char **to)
    wchar_t *unicode = NULL;
    int chars, err;    

   // setlocale(LC_ALL,"zh_CN.GB18030");

    unicode = make_unicode_string((unsigned char*)from);

    chars = wcstombs(*to,unicode, 0)*2 + 1;

    *to = new char[chars];
    memset(*to, 0, chars);

    err = wcstombs(*to, unicode, chars);

delete unicode;
if(err < 0) { fprintf(stderr, "Unicode decode error "); delete *to; *to = NULL; return -1; } return 0; }