【POSIX】利用iconv库将UTF-8字符串转换为UTF-16字符串

打印 上一主题 下一主题

主题 1762|帖子 1762|积分 5286

马上注册,结交更多好友,享用更多功能,让你轻松玩转社区。

您需要 登录 才可以下载或查看,没有账号?立即注册

x
 利用<iconv.h>来进行字符串编码的转换
  #include <fcntl.h>
  #include <iconv.h>
  #include <string.h>
  #include <unistd.h>
  #include <cerrno>
  #include <iostream>
  #include <memory>
  7. // 需要链接iconv库
  8. // iconv -l 命令可列出所有支持的格式
  9. // example: iconv将UTF-16转换为UTF-8
  10. // iconv -f UTF-16 -t UTF-8 myfile
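  11. // 注意: 未指定字节序的 "UTF-16" 的输出字节序取决于iconv实现: GNU libiconv 输出带BOM的大端序(FE FF, 即下文的输出), 而 glibc 使用主机字节序; 需要确定的字节序时请显式指定 UTF-16BE 或 UTF-16LE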
  /**
   * Prints every byte of a buffer to std::cout, one per line, as
   * "<index>: <value>" where value is the byte read as an unsigned
   * number in the range 0..255.
   *
   * @param str  buffer to dump; it does not have to be NUL-terminated
   * @param len  number of bytes of str to print
   */
  void print_str_bytes(const char* str, size_t len) {
      // Index with size_t so the comparison against len is not signed/unsigned
      // and the counter cannot overflow on long buffers.
      for (size_t i = 0; i < len; ++i) {
          // Go through unsigned char first so bytes >= 0x80 are not sign-extended.
          const unsigned value = static_cast<unsigned char>(str[i]);
          std::cout << i << ": " << value << '\n';
      }
  }
  /**
   * Converts a byte string from one character encoding to another using iconv.
   *
   * The output buffer starts at twice the input size plus room for a byte-order
   * mark and is doubled whenever iconv reports E2BIG; bytes that were already
   * converted are kept, so the input is only walked once.
   *
   * @param dst            receives a new[]-allocated buffer with the converted
   *                       bytes; the caller must release it with delete[].
   *                       Written only on success.
   * @param dst_len        receives the number of converted bytes in *dst.
   *                       Written only on success.
   * @param src            input bytes encoded in from_encoding
   * @param src_len        number of bytes in src
   * @param to_encoding    iconv name of the target encoding
   * @param from_encoding  iconv name of the source encoding
   * @return 0 on success, -1 on failure (a diagnostic is written to std::cerr)
   */
  int convert_encoding(char** dst, size_t* dst_len, const char* src, size_t src_len, const char* to_encoding, const char* from_encoding) {
      iconv_t cd = iconv_open(to_encoding, from_encoding);
      if (cd == reinterpret_cast<iconv_t>(-1)) {
          std::cerr << "iconv_open error: " << strerror(errno) << std::endl;
          return -1;
      }

      // 2 bytes per input byte covers UTF-16; the extra 4 bytes leave room for
      // a BOM (2 bytes for UTF-16, 4 for UTF-32) so short inputs fit right away.
      size_t capacity = 2 * src_len + 4;
      std::unique_ptr<char[]> buffer(new char[capacity]);
      size_t used = 0;

      const char* in = src;
      size_t in_left = src_len;
      // Once all input is consumed, iconv is called one more time with a null
      // input to emit any pending reset sequence of a stateful target encoding.
      bool flushing = false;

      for (;;) {
          // iconv advances the pointers it is given, so hand it cursors and
          // keep the start of the allocation inside `buffer`.
          char* out = buffer.get() + used;
          size_t out_left = capacity - used;
          const size_t rc = flushing
              ? iconv(cd, nullptr, nullptr, &out, &out_left)
              : iconv(cd, const_cast<char**>(&in), &in_left, &out, &out_left);
          used = capacity - out_left;

          if (rc != static_cast<size_t>(-1)) {
              if (flushing) {
                  break;
              }
              flushing = true;
              continue;
          }

          if (errno != E2BIG) {
              // Invalid or truncated input (EILSEQ / EINVAL) or another failure.
              std::cerr << "iconv error: " << strerror(errno) << std::endl;
              iconv_close(cd);
              return -1;
          }

          // Output buffer is full: double it and carry over what was converted.
          const size_t bigger = capacity * 2;
          std::unique_ptr<char[]> grown(new char[bigger]);
          memcpy(grown.get(), buffer.get(), used);
          buffer.swap(grown);
          capacity = bigger;
      }

      std::cout << "use bytes: " << used << std::endl;

      if (iconv_close(cd) == -1) {
          std::cerr << "iconv_close error: " << strerror(errno) << std::endl;
          return -1;
      }

      *dst_len = used;
      *dst = buffer.release();   // ownership passes to the caller
      return 0;
  }
  61. int main(int argc, char* argv[]) {
  62.     if (argc != 2) {
  63.         std::cout << "./iconv <src-str>" << std::endl;
  64.         return 0;
  65.     }
  66.     char* str = argv[1];
  67.     print_str_bytes(str, strlen(str));
  68.     char* dst = nullptr;
  69.     size_t dst_len = 0;
  70.     int res = convert_encoding(&dst, &dst_len, str, strlen(str), "UTF-16", "UTF-8");
  71.     if (res == -1) {
  72.         std::cerr << "oops..." << std::endl;
  73.         exit(-1);
  74.     }
  75.     std::cout << "dst_len: " << dst_len << std::endl;
  76.     print_str_bytes(dst, dst_len);
  77.     // 写入到文件
  78.     int fd = open("out.txt", O_RDWR| O_CREAT | O_TRUNC, S_IRWXU);
  79.     if (fd == -1) {
  80.         std::cerr << "open out.txt error: " << strerror(errno) << std::endl;
  81.         exit(-1);
  82.     }
  83.     write(fd, dst, dst_len);
  84.     return 0;
  85. }
复制代码
编译:
  1. c++ -std=c++14 iconv.cpp -o iconv -liconv
复制代码
输出:
  1. ./iconv 你hao,世界
  2. 0: 228
  3. 1: 189
  4. 2: 160
  5. 3: 104
  6. 4: 97
  7. 5: 111
  8. 6: 239
  9. 7: 188
  10. 8: 140
  11. 9: 228
  12. 10: 184
  13. 11: 150
  14. 12: 231
  15. 13: 149
  16. 14: 140
  17. use bytes: 16
  18. dst_len: 16
  19. 0: 254
  20. 1: 255
  21. 2: 79
  22. 3: 96
  23. 4: 0
  24. 5: 104
  25. 6: 0
  26. 7: 97
  27. 8: 0
  28. 9: 111
  29. 10: 255
  30. 11: 12
  31. 12: 78
  32. 13: 22
  33. 14: 117
  34. 15: 76
复制代码
利用iconv -l命令可列出所有支持的编码格式:
  1. ANSI_X3.4-1968 ANSI_X3.4-1986 ASCII CP367 IBM367 ISO-IR-6 ISO646-US ISO_646.IRV:1991 US US-ASCII CSASCII
  2. UTF-8 UTF8
  3. UTF-8-MAC UTF8-MAC
  4. ISO-10646-UCS-2 UCS-2 CSUNICODE
  5. UCS-2BE UNICODE-1-1 UNICODEBIG CSUNICODE11
  6. UCS-2LE UNICODELITTLE
  7. ISO-10646-UCS-4 UCS-4 CSUCS4
  8. UCS-4BE
  9. UCS-4LE
  10. UTF-16
  11. UTF-16BE
  12. UTF-16LE
  13. UTF-32
  14. UTF-32BE
  15. UTF-32LE
  16. UNICODE-1-1-UTF-7 UTF-7 CSUNICODE11UTF7
  17. UCS-2-INTERNAL
  18. UCS-2-SWAPPED
  19. UCS-4-INTERNAL
  20. UCS-4-SWAPPED
  21. C99
  22. JAVA
  23. CP819 IBM819 ISO-8859-1 ISO-IR-100 ISO8859-1 ISO_8859-1 ISO_8859-1:1987 L1 LATIN1 CSISOLATIN1
  24. ISO-8859-2 ISO-IR-101 ISO8859-2 ISO_8859-2 ISO_8859-2:1987 L2 LATIN2 CSISOLATIN2
  25. ISO-8859-3 ISO-IR-109 ISO8859-3 ISO_8859-3 ISO_8859-3:1988 L3 LATIN3 CSISOLATIN3
  26. ISO-8859-4 ISO-IR-110 ISO8859-4 ISO_8859-4 ISO_8859-4:1988 L4 LATIN4 CSISOLATIN4
  27. CYRILLIC ISO-8859-5 ISO-IR-144 ISO8859-5 ISO_8859-5 ISO_8859-5:1988 CSISOLATINCYRILLIC
  28. ARABIC ASMO-708 ECMA-114 ISO-8859-6 ISO-IR-127 ISO8859-6 ISO_8859-6 ISO_8859-6:1987 CSISOLATINARABIC
  29. ECMA-118 ELOT_928 GREEK GREEK8 ISO-8859-7 ISO-IR-126 ISO8859-7 ISO_8859-7 ISO_8859-7:1987 ISO_8859-7:2003 CSISOLATINGREEK
  30. HEBREW ISO-8859-8 ISO-IR-138 ISO8859-8 ISO_8859-8 ISO_8859-8:1988 CSISOLATINHEBREW
  31. ISO-8859-9 ISO-IR-148 ISO8859-9 ISO_8859-9 ISO_8859-9:1989 L5 LATIN5 CSISOLATIN5
  32. ISO-8859-10 ISO-IR-157 ISO8859-10 ISO_8859-10 ISO_8859-10:1992 L6 LATIN6 CSISOLATIN6
  33. ISO-8859-11 ISO8859-11 ISO_8859-11
  34. ISO-8859-13 ISO-IR-179 ISO8859-13 ISO_8859-13 L7 LATIN7
  35. ISO-8859-14 ISO-CELTIC ISO-IR-199 ISO8859-14 ISO_8859-14 ISO_8859-14:1998 L8 LATIN8
  36. ISO-8859-15 ISO-IR-203 ISO8859-15 ISO_8859-15 ISO_8859-15:1998 LATIN-9
  37. ISO-8859-16 ISO-IR-226 ISO8859-16 ISO_8859-16 ISO_8859-16:2001 L10 LATIN10
  38. KOI8-R CSKOI8R
  39. KOI8-U
  40. KOI8-RU
  41. CP1250 MS-EE WINDOWS-1250
  42. CP1251 MS-CYRL WINDOWS-1251
  43. CP1252 MS-ANSI WINDOWS-1252
  44. CP1253 MS-GREEK WINDOWS-1253
  45. CP1254 MS-TURK WINDOWS-1254
  46. CP1255 MS-HEBR WINDOWS-1255
  47. CP1256 MS-ARAB WINDOWS-1256
  48. CP1257 WINBALTRIM WINDOWS-1257
  49. CP1258 WINDOWS-1258
  50. 850 CP850 IBM850 CSPC850MULTILINGUAL
  51. 862 CP862 IBM862 CSPC862LATINHEBREW
  52. 866 CP866 IBM866 CSIBM866
  53. MAC MACINTOSH MACROMAN CSMACINTOSH
  54. MACCENTRALEUROPE
  55. MACICELAND
  56. MACCROATIAN
  57. MACROMANIA
  58. MACCYRILLIC
  59. MACUKRAINE
  60. MACGREEK
  61. MACTURKISH
  62. MACHEBREW
  63. MACARABIC
  64. MACTHAI
  65. HP-ROMAN8 R8 ROMAN8 CSHPROMAN8
  66. NEXTSTEP
  67. ARMSCII-8
  68. GEORGIAN-ACADEMY
  69. GEORGIAN-PS
  70. KOI8-T
  71. CP154 CYRILLIC-ASIAN PT154 PTCP154 CSPTCP154
  72. MULELAO-1
  73. CP1133 IBM-CP1133
  74. ISO-IR-166 TIS-620 TIS620 TIS620-0 TIS620.2529-1 TIS620.2533-0 TIS620.2533-1
  75. CP874 WINDOWS-874
  76. VISCII VISCII1.1-1 CSVISCII
  77. TCVN TCVN-5712 TCVN5712-1 TCVN5712-1:1993
  78. ISO-IR-14 ISO646-JP JIS_C6220-1969-RO JP CSISO14JISC6220RO
  79. JISX0201-1976 JIS_X0201 X0201 CSHALFWIDTHKATAKANA
  80. ISO-IR-87 JIS0208 JIS_C6226-1983 JIS_X0208 JIS_X0208-1983 JIS_X0208-1990 X0208 CSISO87JISX0208
  81. ISO-IR-159 JIS_X0212 JIS_X0212-1990 JIS_X0212.1990-0 X0212 CSISO159JISX02121990
  82. CN GB_1988-80 ISO-IR-57 ISO646-CN CSISO57GB1988
  83. CHINESE GB_2312-80 ISO-IR-58 CSISO58GB231280
  84. CN-GB-ISOIR165 ISO-IR-165
  85. ISO-IR-149 KOREAN KSC_5601 KS_C_5601-1987 KS_C_5601-1989 CSKSC56011987
  86. EUC-JP EUCJP EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE CSEUCPKDFMTJAPANESE
  87. MS_KANJI SHIFT-JIS SHIFT_JIS SJIS CSSHIFTJIS
  88. CP932
  89. ISO-2022-JP CSISO2022JP
  90. ISO-2022-JP-1
  91. ISO-2022-JP-2 CSISO2022JP2
  92. CN-GB EUC-CN EUCCN GB2312 CSGB2312
  93. GBK
  94. CP936 MS936 WINDOWS-936
  95. GB18030
  96. ISO-2022-CN CSISO2022CN
  97. ISO-2022-CN-EXT
  98. HZ HZ-GB-2312
  99. EUC-TW EUCTW CSEUCTW
  100. BIG-5 BIG-FIVE BIG5 BIGFIVE CN-BIG5 CSBIG5
  101. CP950
  102. BIG5-HKSCS:1999
  103. BIG5-HKSCS:2001
  104. BIG5-HKSCS BIG5-HKSCS:2004 BIG5HKSCS
  105. EUC-KR EUCKR CSEUCKR
  106. CP949 UHC
  107. CP1361 JOHAB
  108. ISO-2022-KR CSISO2022KR
  109. CP856
  110. CP922
  111. CP943
  112. CP1046
  113. CP1124
  114. CP1129
  115. CP1161 IBM-1161 IBM1161 CSIBM1161
  116. CP1162 IBM-1162 IBM1162 CSIBM1162
  117. CP1163 IBM-1163 IBM1163 CSIBM1163
  118. DEC-KANJI
  119. DEC-HANYU
  120. 437 CP437 IBM437 CSPC8CODEPAGE437
  121. CP737
  122. CP775 IBM775 CSPC775BALTIC
  123. 852 CP852 IBM852 CSPCP852
  124. CP853
  125. 855 CP855 IBM855 CSIBM855
  126. 857 CP857 IBM857 CSIBM857
  127. CP858
  128. 860 CP860 IBM860 CSIBM860
  129. 861 CP-IS CP861 IBM861 CSIBM861
  130. 863 CP863 IBM863 CSIBM863
  131. CP864 IBM864 CSIBM864
  132. 865 CP865 IBM865 CSIBM865
  133. 869 CP-GR CP869 IBM869 CSIBM869
  134. CP1125
  135. EUC-JISX0213
  136. SHIFT_JISX0213
  137. ISO-2022-JP-3
  138. BIG5-2003
  139. ISO-IR-230 TDS565
  140. ATARI ATARIST
  141. RISCOS-LATIN1
复制代码


免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

吴旭华

论坛元老
这个人很懒什么都没写!
快速回复 返回顶部 返回列表