当时不太会用 PostgreSQL,在 Windows 下的 PowerShell 里直接导出为 SQL 文件,没有查看里面的内容。最近需要恢复数据,才发现内容中的中文全是乱码,英文是正常的。
尝试了很多方法后,发现文件是 UTF-16 编码的:Little-endian UTF-16 Unicode text, with very long lines, with CRLF line terminators
使用下面的命令可以导出为 gbk 编码,部分中文可以还原,但还是有相当多的字符无法还原
iconv -f UTF-16 -t GBK < export.sql > out.sql
涉及到相关问题的,只有这个文章有所说明 https://www.cnblogs.com/xyb930826/p/4657462.html
尝试了其中的 C 代码,仍然有相同的问题,有些编码无法转换成功
使用 Python 重写了代码,但仍然有部分中文无法解码成功
def read_map():
    """Load the Unicode-to-GBK lookup table from ``UnicodeToGBK.txt``.

    Each non-blank line of the file is expected to hold two
    whitespace-separated hexadecimal numbers: a UTF-16 code unit followed
    by the GBK code it maps to.  Blank or whitespace-only lines are
    skipped, and a leading UTF-8 byte-order mark is tolerated.

    Returns:
        A dict mapping the integer code unit to the integer GBK code, or
        ``None`` (after printing an error message) when the file cannot
        be opened or read.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            hexadecimal fields.
    """
    map_value = {}
    try:
        # The table is plain hex text; pin the encoding so the result does
        # not depend on the platform's locale default.
        with open("UnicodeToGBK.txt", "r", encoding="utf-8-sig") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # An empty line (typically the trailing one) carries
                    # no mapping and must not abort the whole load.
                    continue
                utf_str, gbk_str = fields
                map_value[int(utf_str, 16)] = int(gbk_str, 16)
    except IOError:
        print("Error reading mapping file!")
        return None
    return map_value
def convert_utf16_to_gbk1(input_file, output_file):
    """Convert a little-endian UTF-16 file to GBK bytes, unit by unit.

    The input is consumed as 16-bit little-endian code units (a leading
    ``FF FE`` byte-order mark is skipped) and handled in three ways:

    * high byte zero, low byte above 0x7F: the low byte of this unit and
      the low byte of the *next* unit are written as one two-byte
      sequence; the next unit's high byte is discarded;
    * high byte zero, low byte at most 0x7F: the low byte is written
      as-is;
    * anything else: the unit is looked up in the table returned by
      ``read_map()`` and the result is written as two big-endian bytes.
      Units missing from the table are written as two zero bytes.

    A trailing incomplete unit, or a high-byte unit that has no complete
    following unit, ends the conversion without being written.

    Args:
        input_file: Path of the UTF-16 encoded file to read.
        output_file: Path of the file to write; it is overwritten.

    Returns:
        None.  Progress and failures are reported with ``print``.
    """
    map_value = read_map()
    if map_value is None:
        print("Convert Failed!")
        return
    try:
        with open(input_file, "rb") as f_in, open(output_file, "wb") as f_out:
            # Skip the little-endian BOM; rewind when the file has none.
            if f_in.read(2) != b'\xff\xfe':
                f_in.seek(0)
            while True:
                unit = f_in.read(2)
                if len(unit) < 2:
                    break
                ch, cl = unit  # low byte first (little-endian)
                if cl == 0x00 and ch > 0x7f:
                    following = f_in.read(2)
                    if len(following) < 2:
                        # The input ended in the middle of a pair.  read()
                        # returns b'' at EOF, so calling ord() on it would
                        # raise TypeError; stop instead.
                        break
                    f_out.write(bytes([ch, following[0]]))
                elif cl == 0x00:
                    f_out.write(bytes([ch]))
                else:
                    utf = cl * 256 + ch
                    gbk = map_value.get(utf, 0)
                    f_out.write(bytes([gbk // 256, gbk % 256]))
    except IOError:
        print("Error processing files!")
        return
    print("Conversion completed successfully!")
def convert_utf16_to_gbk(input_file, output_file):
    """Convert a little-endian UTF-16 file to GBK using the mapping table.

    The input is consumed as 16-bit little-endian code units (a leading
    ``FF FE`` byte-order mark is skipped).  Each unit is handled as
    follows:

    * present in the table returned by ``read_map()``: the mapped GBK
      code is written, as one byte when it fits in a byte and as two
      big-endian bytes otherwise;
    * not in the table but at most 0x7F: written as a single byte;
    * anything else: reported on standard output and left out of the
      output file.

    A trailing odd byte at the end of the input is ignored.

    Args:
        input_file: Path of the UTF-16 encoded file to read.
        output_file: Path of the file to write; it is overwritten.

    Returns:
        None.  Progress and failures are reported with ``print``.
    """
    map_value = read_map()
    if map_value is None:
        print("Convert Failed!")
        return
    try:
        with open(input_file, "rb") as f_in, open(output_file, "wb") as f_out:
            # Skip the little-endian BOM; rewind when the file has none.
            if f_in.read(2) != b'\xff\xfe':
                f_in.seek(0)
            while True:
                unit = f_in.read(2)
                if len(unit) < 2:
                    break
                utf = int.from_bytes(unit, "little")
                if utf in map_value:
                    gbk = map_value[utf]
                    if gbk <= 0xff:
                        # A single-byte code must not be padded with a
                        # leading NUL byte.
                        f_out.write(bytes([gbk]))
                    else:
                        f_out.write(bytes([gbk // 256, gbk % 256]))
                elif utf <= 0x7f:
                    # ASCII passes through unchanged.
                    f_out.write(bytes([utf]))
                elif 0xd800 <= utf <= 0xdfff:
                    # A lone surrogate cannot be encoded by a strict text
                    # stream, so printing chr(utf) could abort the whole
                    # conversion; report the code unit only.
                    print(f"Unable to convert UTF-16 character: U+{utf:04X}")
                else:
                    # Unmapped characters are reported and nothing is
                    # written to the output file for them.
                    print(f"Unable to convert UTF-16 character: U+{utf:04X}", chr(utf))
    except IOError:
        print("Error processing files!")
        return
    print("Conversion completed successfully!")
if __name__ == "__main__":
    # Script entry point: convert export.sql into out.sql.
    convert_utf16_to_gbk(input_file="export.sql", output_file="out.sql")
比如 UnicodeToGBK.txt 的对应关系如下
90C5 DBA4
90C6 E042
90C7 DBA8
90C8 E043
90C9 E044
90CA BDBC
90CB E045
90CC E046
90CD E047
90CE C0C9
90CF DBA3
90D0 DBA6
90D1 D6A3
无法转换的字符(脚本输出)如下
Unable to convert UTF-16 character: U+E046
Unable to convert UTF-16 character: U+E160
Unable to convert UTF-16 character: U+E1E2
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E0A5
Unable to convert UTF-16 character: U+E195
Unable to convert UTF-16 character: U+E218
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E1EC
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E1BC
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E11E
Unable to convert UTF-16 character: U+E1C0
Unable to convert UTF-16 character: U+E57D
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E21A
Unable to convert UTF-16 character: U+E6E7
Unable to convert UTF-16 character: U+E11C
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E11C
Unable to convert UTF-16 character: U+20AC €
Unable to convert UTF-16 character: U+E6E7
论坛上技术人员较多,不知有没有人遇到过类似的问题,还望不吝赐教
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.