最近學習了下編碼
以下地址可以很好的學習到相關的知識
http://dev.csdn.net/develop/article/69/69883.shtm
http://dev.csdn.net/develop/article/72/72888.shtm
其中講了UTF8的編碼
當要表示的內容是 7位 的時候就用一個字節:0******* 第一個0為標志位,剩下的空間正好可以表示ASCII 0-127 的內容。
當要表示的內容在 8 到 11 位的時候就用兩個字節:110***** 10****** 第一個字節的110和第二個字節的10為標志位。
當要表示的內容在 12 到 16 位的時候就用三個字節:1110**** 10****** 10****** 和上面一樣,第一個字節的1110和第二、三個字節的10都是標志位,剩下的空間正好可以表示漢字。
以此類推:
四個字節:11110*** 10****** 10****** 10******
五個字節:111110** 10****** 10****** 10****** 10******
六個字節:1111110* 10****** 10****** 10****** 10****** 10******
.............................................
我自己寫了轉換的代碼如下
UCS和UTF8相互轉換
/*
 * Encode a NUL-terminated wide string as UTF-8.
 *
 * pUCS   input wide string (UTF-16 code units when wchar_t is 16 bits wide,
 *        code points when it is wider).
 * pUTF8  output buffer supplied by the caller; it must be large enough for
 *        the encoded text (at most 4 bytes per input element). No size is
 *        passed in, so no bounds checking is possible here.
 *
 * Returns the number of bytes written, or -1 if either pointer is NULL.
 * No terminating NUL byte is appended to the output.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one code point and written as a 4-byte sequence. An unpaired surrogate is
 * written as a 3-byte sequence, as the previous implementation did. Values
 * above U+10FFFF are replaced by U+FFFD.
 */
int UCS2UTF8(wchar_t* pUCS,unsigned char* pUTF8)
{
    unsigned char *out;
    size_t len, i;

    /* Validate before touching the input: wcslen(NULL) is undefined. */
    if (pUCS == NULL || pUTF8 == NULL)
        return -1;

    len = wcslen(pUCS);
    out = pUTF8;

    for (i = 0; i < len; i++) {
        unsigned long cp = (unsigned long)pUCS[i];

        /* Combine a UTF-16 surrogate pair into a single code point. */
        if (cp >= 0xD800UL && cp <= 0xDBFFUL && i + 1 < len) {
            unsigned long low = (unsigned long)pUCS[i + 1];
            if (low >= 0xDC00UL && low <= 0xDFFFUL) {
                cp = 0x10000UL + ((cp - 0xD800UL) << 10) + (low - 0xDC00UL);
                i++;
            }
        }

        /* Not a Unicode code point (also catches negative wchar_t values). */
        if (cp > 0x10FFFFUL)
            cp = 0xFFFDUL;

        if (cp <= 0x7FUL) {
            /* 1 byte: 0xxxxxxx */
            *out++ = (unsigned char)cp;
        } else if (cp <= 0x7FFUL) {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xC0 | (cp >> 6));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else if (cp <= 0xFFFFUL) {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xE0 | (cp >> 12));
            *out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else {
            /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xF0 | (cp >> 18));
            *out++ = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
            *out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        }
    }

    return (int)(out - pUTF8);
}
/*
 * Decode a NUL-terminated UTF-8 byte string into wide characters.
 *
 * pUTF8  input bytes, terminated by a 0 byte.
 * pUCS   output buffer supplied by the caller; it must be large enough for
 *        the decoded text (never more elements than there are input bytes).
 *        No size is passed in, so no bounds checking is possible here.
 *
 * Returns the number of wchar_t elements written, or -1 if either pointer
 * is NULL. No terminating L'\0' is appended to the output.
 *
 * Values are stored as wchar_t directly, so the result does not depend on
 * byte order or on wchar_t being exactly two bytes. A 4-byte sequence is
 * stored as a surrogate pair when wchar_t is 16 bits wide and as a single
 * element otherwise. A byte that does not start a well-formed sequence is
 * decoded as U+FFFD and skipped on its own. Overlong encodings are not
 * rejected.
 */
int UTF82UCS(unsigned char *pUTF8, wchar_t *pUCS)
{
    const unsigned char *in;
    wchar_t *out;

    if (pUCS == NULL || pUTF8 == NULL)
        return -1;

    in = pUTF8;
    out = pUCS;

    while (*in != 0) {
        unsigned long cp;
        unsigned char lead = *in;

        /*
         * Each continuation byte is tested before the next one is read.
         * The terminating 0 is never a continuation byte, so the
         * short-circuit keeps every read inside the string.
         */
        if (lead <= 0x7F) {
            /* 1 byte: 0xxxxxxx */
            cp = lead;
            in += 1;
        } else if (lead >= 0xC0 && lead <= 0xDF &&
                   (in[1] & 0xC0) == 0x80) {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            cp = ((unsigned long)(lead & 0x1F) << 6) |
                 (unsigned long)(in[1] & 0x3F);
            in += 2;
        } else if (lead >= 0xE0 && lead <= 0xEF &&
                   (in[1] & 0xC0) == 0x80 &&
                   (in[2] & 0xC0) == 0x80) {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            cp = ((unsigned long)(lead & 0x0F) << 12) |
                 ((unsigned long)(in[1] & 0x3F) << 6) |
                 (unsigned long)(in[2] & 0x3F);
            in += 3;
        } else if (lead >= 0xF0 && lead <= 0xF7 &&
                   (in[1] & 0xC0) == 0x80 &&
                   (in[2] & 0xC0) == 0x80 &&
                   (in[3] & 0xC0) == 0x80) {
            /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            cp = ((unsigned long)(lead & 0x07) << 18) |
                 ((unsigned long)(in[1] & 0x3F) << 12) |
                 ((unsigned long)(in[2] & 0x3F) << 6) |
                 (unsigned long)(in[3] & 0x3F);
            if (cp > 0x10FFFFUL)
                cp = 0xFFFDUL;
            in += 4;
        } else {
            /* Stray continuation byte, truncated sequence or invalid lead. */
            cp = 0xFFFDUL;
            in += 1;
        }

        if (cp > 0xFFFFUL && sizeof(wchar_t) == 2) {
            /* 16-bit wchar_t cannot hold the value: emit a surrogate pair. */
            cp -= 0x10000UL;
            *out++ = (wchar_t)(0xD800UL + (cp >> 10));
            *out++ = (wchar_t)(0xDC00UL + (cp & 0x3FFUL));
        } else {
            *out++ = (wchar_t)cp;
        }
    }

    return (int)(out - pUCS);
}
其他編碼和UCS轉換
/*
 * Convert a NUL-terminated multibyte string in the given code page to wide
 * characters using the Win32 MultiByteToWideChar API.
 *
 * p         input string in the encoding named by codepage.
 * pUCS      output buffer supplied by the caller; its capacity is not
 *           checked here.
 * codepage  Windows code page identifier passed straight to the API.
 *
 * Returns -1 if either pointer is NULL. Otherwise returns the value of the
 * sizing call; per the Win32 documentation that count includes the
 * terminating null character when the input length is given as -1, and is 0
 * on failure.
 */
int ToUCS(unsigned char *p,wchar_t* pUCS,int codepage)
{
    int needed;

    if (p == NULL || pUCS == NULL)
    {
        return -1;
    }

    /* First call only measures: a NULL buffer with size 0 asks for the length. */
    needed = MultiByteToWideChar(codepage, 0, p, -1, NULL, 0);

    /* Second call performs the conversion into the caller's buffer. */
    MultiByteToWideChar(codepage, 0, p, -1, pUCS, needed);

    return needed;
}
/*
 * Convert a NUL-terminated wide string to a multibyte string in the given
 * code page using the Win32 WideCharToMultiByte API.
 *
 * pUCS      input wide string.
 * p         output buffer supplied by the caller; its capacity is not
 *           checked here.
 * codepage  Windows code page identifier passed straight to the API.
 *
 * Returns the number of bytes written excluding the terminating NUL, or -1
 * if either pointer is NULL or the conversion fails.
 */
int UCSTo(wchar_t* pUCS,char *p,int codepage)
{
    int len;

    /* The original tested an undeclared identifier instead of p. */
    if (pUCS == NULL || p == NULL)
        return -1;

    /* Sizing call: a NULL buffer with size 0 returns the required length. */
    len = WideCharToMultiByte(codepage, 0, pUCS, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
        return -1;

    len = WideCharToMultiByte(codepage, 0, pUCS, -1, p, len, NULL, NULL);
    if (len <= 0)
        return -1;

    /* The API's count includes the terminating NUL; report it without. */
    return len - 1;
}
這裡的 codepage 在 MSDN 定義如下(傳入函數的是「Code page」一欄的數值,例如 950 代表繁體中文)
Bit | Code page | Description |
ANSI | | |
0 | 1252 | Latin 1 |
1 | 1250 | Latin 2: Eastern Europe |
2 | 1251 | Cyrillic |
3 | 1253 | Greek |
4 | 1254 | Turkish |
5 | 1255 | Hebrew |
6 | 1256 | Arabic |
7 | 1257 | Baltic |
8 | 1258 | VietNam |
9 - 15 | | Reserved for ANSI |
ANSI and OEM | | |
16 | 874 | Thai |
17 | 932 | Japanese, Shift-JIS |
18 | 936 | Chinese: Simplified chars—PRC and Singapore |
19 | 949 | Korean Unified Hangeul Code (Hangeul TongHabHyung Code) |
20 | 950 | Chinese: Traditional chars—Hong Kong SAR, PRC and Taiwan |
21 | 1361 | Korean (Johab) |
22 - 29 | | Reserved for alternate ANSI and OEM |
30 - 31 | | Reserved by system. |
OEM | | |
32 - 46 | | Reserved for OEM |
47 | 1258 | VietNam |
48 | 869 | IBM Greek |
49 | 866 | MS-DOS Russian |
50 | 865 | MS-DOS Nordic |
51 | 864 | Arabic |
52 | 863 | MS-DOS Canadian French |
53 | 862 | Hebrew |
54 | 861 | MS-DOS Icelandic |
55 | 860 | MS-DOS Portuguese |
56 | 857 | IBM Turkish |
57 | 855 | IBM Cyrillic; primarily Russian |
58 | 852 | Latin 2 |
59 | 775 | Baltic |
60 | 737 | Greek; former 437 G |
61 | 708 | Arabic; ASMO 708 |
62 | 850 | Western European/Latin 1 |
63 | 437 | US |