我在解决乱码上面实际走了不少弯路，做了很多实验，查了很多资料。在这里做下笔记，希望后来者可以明白，少走些弯路。
从最熟悉的两种字符编码说起
除了一些旧的、没有考虑到兼容性的网页还在用gbk做编码外，大部分的网页都已经用utf-8做编码了。但是最令人头疼的是，windows的控制台是很不好显示utf-8的。有明君为我大C++写了两个函数，是正确的、好用的（除了用std::string做返回值让我等效率党有点觉得不爽之外……还是挺方便的）。
#include <string>
#include <vector>
#include <windows.h>
using std::string;
// GBK -> UTF-8.
//
// Converts text in the system ANSI code page to UTF-8 by going through
// UTF-16 (MultiByteToWideChar followed by WideCharToMultiByte).
//
// strGBK  NUL-terminated input text. Conversion stops at the first NUL byte,
//         because the input is handed to the API as a C string (length -1).
// Returns the UTF-8 encoded text, or an empty string if either conversion
// step reports failure.
//
// NOTE(review): CP_ACP is the *system* ANSI code page. It equals GBK (936)
// only on Simplified-Chinese Windows; kept as-is to preserve existing
// behaviour -- confirm whether strict code page 936 is intended.
string GBKToUTF8(const string& strGBK)
{
    // Step 1: ANSI -> UTF-16. With a source length of -1 the returned count
    // includes the terminating NUL, so a successful call never returns 0.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, strGBK.c_str(), -1, NULL, 0);
    if (wideLen <= 0)
        return string();

    // std::vector owns the buffer, so nothing leaks if a later allocation
    // or the final string construction throws.
    std::vector<wchar_t> wide(wideLen);
    if (MultiByteToWideChar(CP_ACP, 0, strGBK.c_str(), -1, &wide[0], wideLen) <= 0)
        return string();

    // Step 2: UTF-16 -> UTF-8, again sized by a first "measure" call.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, &wide[0], -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
        return string();

    std::vector<char> utf8(utf8Len);
    if (WideCharToMultiByte(CP_UTF8, 0, &wide[0], -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
        return string();

    // The buffer is NUL-terminated by the API; copy up to that terminator.
    return string(&utf8[0]);
}
// UTF-8 -> GBK.
//
// Converts UTF-8 text to the system ANSI code page by going through UTF-16
// (MultiByteToWideChar followed by WideCharToMultiByte).
//
// strUTF8  NUL-terminated UTF-8 input. Conversion stops at the first NUL
//          byte, because the input is handed to the API as a C string
//          (length -1).
// Returns the text encoded in the ANSI code page, or an empty string if
// either conversion step reports failure.
//
// NOTE(review): CP_ACP is the *system* ANSI code page. It equals GBK (936)
// only on Simplified-Chinese Windows; kept as-is to preserve existing
// behaviour -- confirm whether strict code page 936 is intended.
string UTF8ToGBK(const string& strUTF8)
{
    // Step 1: UTF-8 -> UTF-16. The source pointer is passed as plain
    // const char* (no LPCTSTR cast, which is const wchar_t* in UNICODE builds).
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);
    if (wideLen <= 0)
        return string();

    // wchar_t (not unsigned short) is what LPWSTR points to; std::vector
    // value-initialises the buffer and releases it on every exit path.
    std::vector<wchar_t> wide(wideLen);
    if (MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, &wide[0], wideLen) <= 0)
        return string();

    // Step 2: UTF-16 -> ANSI code page, sized by a first "measure" call.
    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, &wide[0], -1, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
        return string();

    std::vector<char> ansi(ansiLen);
    if (WideCharToMultiByte(CP_ACP, 0, &wide[0], -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
        return string();

    // The buffer is NUL-terminated by the API; copy up to that terminator.
    return string(&ansi[0]);
}
这玩意儿不跨平台，因为它用到了windows api。我之所以把它放到跨平台编程上面来，是因为字符编码这东西只有到跨平台的时候才显得坑爹。
接着我是不是要介绍那俩函数一下？
/* Win32 prototype, quoted for reference: converts a multibyte string to a wide-character string. */
int MultiByteToWideChar(
_In_ UINT CodePage, /* Code page: the Windows term for a character encoding. GBK is 936, UTF-8 is 65001, and CP_ACP is the ANSI code page. */
_In_ DWORD dwFlags, /* Option flags selecting the conversion type; passing 0 is fine. */
_In_ LPCSTR lpMultiByteStr, /* The multibyte string to convert. */
_In_ int cbMultiByte, /* Length of the input to process; with -1 the function processes the whole string. */
_Out_opt_ LPWSTR lpWideCharStr, /* Output wide-character buffer; if it is null the function returns the required wide-string length. */
_In_ int cchWideChar /* Size of the wide-character buffer; pass 0 when the buffer is null. */
);
/* Win32 prototype, quoted for reference: converts a wide-character string to a multibyte string. */
int WideCharToMultiByte(
_In_ UINT CodePage,
_In_ DWORD dwFlags,
_In_ LPCWSTR lpWideCharStr,
_In_ int cchWideChar,
_Out_opt_ LPSTR lpMultiByteStr,
_In_ int cbMultiByte, /* The parameters up to here mirror those of MultiByteToWideChar, so they are not explained again. */
_In_opt_ LPCSTR lpDefaultChar, /* Just pass 0. */
_Out_opt_ LPBOOL lpUsedDefaultChar /* Just pass 0. */
);
这两个函数分别是将多字节字符串转换为宽字符字符串 和 将宽字符字符串转换为多字节字符串（在此处晕倒的童鞋们我没有对不起你们……是M$那家伙对不起你们）。我早就说过Windows API 的界面不友好，这么多不知道干嘛用的参数，全部填0就好了。要是iconv()，它貌似只有4个参数，这才是好的榜样。
宽字符？多字节？
这是Windows给它们起的名字，让人摸不着头脑。
宽字符之所以叫做宽字符，是因为它是一个宽一点的字符。那什么是短字符……就是ascii了，1个字节1个字符绝对够短，而且只能表示256个西欧字符。宽字符呢，是2个字节1个字符。宽一点，但还是可以识别到一个字符是哪里的。而多字节呢，就是它在计算机里表示成多个字节，但是没有办法识别哪里到哪里是一个字符。
我不喜欢这两个函数的命名。如果按照python的命名，MultiByteToWideChar 应该叫 decode(解码)，WideCharToMultiByte 应该叫 encode(编码)。
所以呢？
如你所见，多字节无法准确识别字符的长度，处理起来就会很麻烦。而宽字符大多时候虽然比多字节多耗费一点空间，但是处理起来方便。比如正则表达式处理，引擎是基于字符去匹配的，宽字符可以两个字节两个字节跳着匹配，而多字节就会匹配错误。
比如有一个词“程序”=0xB3CCD0F2(gbk)，我想匹配“绦”=0xCCD0(gbk)，正则库会替我把中间那两个字节匹配了。在C里用wchar_t，C++里用std::wstring，我们可以很准确地、无错误地匹配到我们想要的子串，因为引擎在迭代的时候是逐字（而不是逐字节）进行比较的。
1 >>> str1 = "绦"
2 >>> str2 = "程序"
3 >>> print re.findall(str1, str2)
4 ['\xcc\xd0']
5 >>> print re.findall(str1.decode("gbk"), str2.decode("gbk"))
6 []
所以在处理字符串的时候，但凡要处理中文，要先把用户给的字符串解码成Unicode。处理完之后显示出来或者保存，再编码成需要的charset。
Appendix
在不同的地方用不同的编码：
- 网络文本（如网页）传输一般用utf-8，因为有少量中文，而大部分是英文。
- 在保存为本地文件的时候，应该保存为Unicode，因为本地存储资源丰富，且可以节省时间，实时解码毕竟也是O(N^2)啊。
- 显示出来应该用系统的编码，中文Windows为gbk，繁体Windows为Big5，Linux一律为UTF-8。
- 源代码里的少量中文串尽量用"\x????\x????"来表示，如果有大量中文建议用gettext或者资源之类的以外挂的方式读入。
- Qt内部使用Unicode，所以编写Qt应用时显示文字直接传递宽字符串即可。
- NTFS的文件名、路径都是用UTF-16LE（而不是GBK）编码的，所以如果Windows下用户输入的是路径就无需解码了。

]]>