From 0ed125e96697a78ae53f81ae87dea56c98972750 Mon Sep 17 00:00:00 2001 From: shelllee Date: Fri, 20 Aug 2021 21:22:52 +0800 Subject: [PATCH 1/3] Fixed: "UnicodeDecodeError: 'gbk' codec can't decode byte 0xf1 in position 449: illegal multibyte sequence". --- analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyzer.py b/analyzer.py index f42448a..7bdc179 100644 --- a/analyzer.py +++ b/analyzer.py @@ -123,7 +123,7 @@ def parse(self, filepath): # File id 0 is always the current file (we store only the base file name without .txt extension). self._external_references[0] = self._file_index.get_id(os.path.basename(filepath[:-4])) - with open(filepath) as f: + with open(filepath, encoding="latin1") as f: line = f.readline() # Parse external references. if line == "External References\n": From 39a97c6ae592ccf4f1fc4b985849c48f88f535cd Mon Sep 17 00:00:00 2001 From: shelllee Date: Fri, 20 Aug 2021 21:40:15 +0800 Subject: [PATCH 2/3] Use latin1 encoding instead of errors='ignore' for the whole-file read. --- analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyzer.py b/analyzer.py index 7bdc179..58cf622 100644 --- a/analyzer.py +++ b/analyzer.py @@ -142,7 +142,7 @@ def parse(self, filepath): global_index = self._file_index.get_id(file) self._external_references[local_index] = global_index - with open(filepath, errors='ignore') as f: + with open(filepath, encoding="latin1") as f: data = f.read() # Parse the whole file, extract all objects. From 9e6eed0b34f4f97e0bdd47f30656f17e1fb5d7d3 Mon Sep 17 00:00:00 2001 From: shelllee Date: Wed, 22 Sep 2021 18:49:34 +0800 Subject: [PATCH 3/3] Read as latin1 and try to decode as UTF-8 line by line. 
--- analyzer.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/analyzer.py b/analyzer.py index 58cf622..b5b0d69 100644 --- a/analyzer.py +++ b/analyzer.py @@ -142,15 +142,29 @@ def parse(self, filepath): global_index = self._file_index.get_id(file) self._external_references[local_index] = global_index + # Read as latin1, then try to re-decode each line as UTF-8 (invalid bytes are ignored). + data = "" + with open(filepath, encoding="latin1") as f: - data = f.read() + line = f.readline() + try: + line = line.encode("latin1").decode("utf-8", "ignore") + finally: + data += line + + while line: + line = f.readline() + try: + line = line.encode("latin1").decode("utf-8", "ignore") + finally: + data += line # Parse the whole file, extract all objects. regex = re.compile(r"ID: (\-?[a-f0-9]+) \(ClassID: (\d+)\) (\w+)([\s\S]*?(?=(\n{2,}ID:|$)))") matches = regex.findall(data) objects = {} - + # Parse individual objects. for match in matches: try: