From fe4e4a3b380f7c8e1fefebfb0e2c4c2f1f536099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Joubert?= Date: Wed, 16 Jan 2019 23:36:50 +0100 Subject: [PATCH 1/4] More flexible handling of encoding error --- deltas/segmenters/segments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deltas/segmenters/segments.py b/deltas/segmenters/segments.py index d279dd9..c8ea2f2 100644 --- a/deltas/segmenters/segments.py +++ b/deltas/segmenters/segments.py @@ -94,7 +94,7 @@ class MatchableSegment(Segment): def initialize(self, *args, **kwargs): super().initialize(*args, **kwargs) - self.sha1 = hashlib.sha1(bytes(str(self), 'utf-8')) + self.sha1 = hashlib.sha1(bytes(str(self), 'utf-8', errors = "replace")) self.match = None def __eq__(self, other): @@ -117,7 +117,7 @@ def __setstate__(self, args): self.initialize(*args) def append(self, subsegment): super().append(subsegment) - self.sha1.update(bytes(str(subsegment), 'utf-8')) + self.sha1.update(bytes(str(subsegment), 'utf-8', errors = "replace")) def extend(self, subsegments): for subsegment in subsegments: From 3c259208bd68b715a8a9321b69cd7a4d1a7e496c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Joubert?= Date: Sun, 24 Mar 2019 19:20:36 +0100 Subject: [PATCH 2/4] apply_get_a and apply_get_b : building tokens from operations --- deltas/__init__.py | 4 +++- deltas/apply_get_a.py | 20 ++++++++++++++++++++ deltas/apply_get_b.py | 21 +++++++++++++++++++++ deltas/operations.py | 2 +- notes.md | 17 +++++++++++++++++ test_apply_get_a.py | 33 +++++++++++++++++++++++++++++++++ 6 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 deltas/apply_get_a.py create mode 100644 deltas/apply_get_b.py create mode 100644 notes.md create mode 100644 test_apply_get_a.py diff --git a/deltas/__init__.py b/deltas/__init__.py index 5b2d37e..c480103 100644 --- a/deltas/__init__.py +++ b/deltas/__init__.py @@ -1,4 +1,6 @@ from .apply import apply +from .apply_get_a import apply_get_a +from .apply_get_b 
import apply_get_b from .operations import Operation, Insert, Delete, Equal from .algorithms.diff_engine import DiffEngine from .algorithms import segment_matcher, SegmentMatcher @@ -11,7 +13,7 @@ from .about import (__name__, __version__, __author__, __author_email__, __description__, __license__, __url__) -__all__ = [apply, +__all__ = [apply, apply_get_a, apply_get_b, Operation, Insert, Delete, Equal, DiffEngine, segment_matcher, SegmentMatcher, diff --git a/deltas/apply_get_a.py b/deltas/apply_get_a.py new file mode 100644 index 0000000..58a8c4c --- /dev/null +++ b/deltas/apply_get_a.py @@ -0,0 +1,20 @@ +def apply_get_a(operations_diff_file): + length_a = max([operation["a2"] for operation in operations_diff_file]) + a = [''] * length_a + + for operation in operations_diff_file: + + if operation["name"] == "equal" or operation["name"] == "delete": + #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) + if "tokens" in operation.keys(): + a[operation["a1"]:operation["a2"]] = operation["tokens"] + + elif operation["name"] == "insert": + #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) + pass + + else: + raise TypeError("Unexpected operation type " + \ + "{0}".format(type(operation))) + + return ' '.join(a) diff --git a/deltas/apply_get_b.py b/deltas/apply_get_b.py new file mode 100644 index 0000000..0e4bedf --- /dev/null +++ b/deltas/apply_get_b.py @@ -0,0 +1,21 @@ +def apply_get_b(operations_diff_file): + + length_b = max([operation["b2"] for operation in operations_diff_file]) + b = [''] * length_b + + for operation in operations_diff_file: + + if operation["name"] == "equal" or operation["name"] == "insert": + #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) + if "tokens" in operation.keys(): + b[operation["b1"]:operation["b2"]] = operation["tokens"] + + elif operation["name"] == "delete": + #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) + pass + + else: + raise 
TypeError("Unexpected operation type " + \ + "{0}".format(type(operation))) + + return ' '.join(b) diff --git a/deltas/operations.py b/deltas/operations.py index 1890183..ae8df1e 100644 --- a/deltas/operations.py +++ b/deltas/operations.py @@ -103,7 +103,7 @@ def __new__(cls, a1, a2, b1, b2, name=None): return Operation.__new__(cls, "equal", a1, a2, b1, b2) def relevant_tokens(self, a, b): - return a[self.a1:self.a2] + return b[self.b1:self.b2] def print_operations(operations, a, b): diff --git a/notes.md b/notes.md new file mode 100644 index 0000000..d07290f --- /dev/null +++ b/notes.md @@ -0,0 +1,17 @@ +# Faire l'exemple suivant : + +- un chien est un sale animal +- Iris est un sale animal +- Iris est un sale chat + +# incompréhension + +- fonctionnement de la fonction apply : pourquoi yield ? + +# Différences dans les operations + +- dans les diff, il y a les tokens qui permettent de reconstruire le fichier, pas dans le apply **voilà pourquoi le package original ne permet pas cette fonction** + +# Manuel d'utilisation + +- s'assurer que l'on a bien le même tokenizer de l'un à l'autre diff --git a/test_apply_get_a.py b/test_apply_get_a.py new file mode 100644 index 0000000..44c4f2d --- /dev/null +++ b/test_apply_get_a.py @@ -0,0 +1,33 @@ +from deltas import segment_matcher, text_split +from deltas import apply_get_a, apply_get_b +from deltas import Operation, Insert, Delete, Equal + +import deltas +from pprint import pprint + +a = text_split.tokenize("This is some text. This is some other text.") +b = text_split.tokenize("This is some other text.
This is some text.") +operations = segment_matcher.diff(a, b) + +operations_format = [] +for op in operations: + tmp = { + 'name': op.name, + 'a1': op.a1, + 'b1': op.b1, + 'a2': op.a2, + 'b2': op.b2, + 'tokens' : [str(token) for token in op.relevant_tokens(a,b)] + } + operations_format.append(tmp) + +print(apply_get_a(operations_format)) +print(apply_get_b(operations_format)) + +operations_new = [] +for op in operations_format: + tmp = Operation(name = op["name"], a1 = op["a1"], a2 = op["a2"], b1 = op["b1"], b2 = op["b2"]) + operations_new.append(tmp) + +print(operations) +print(operations_new) From 86e74d265c05457d0aec412f5b64812a0cc81ce2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Joubert?= Date: Mon, 25 Mar 2019 07:20:56 +0100 Subject: [PATCH 3/4] handling errors --- deltas/apply_get_a.py | 34 ++++++++++++++++++++-------------- deltas/apply_get_b.py | 34 ++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/deltas/apply_get_a.py b/deltas/apply_get_a.py index 58a8c4c..8e2dff4 100644 --- a/deltas/apply_get_a.py +++ b/deltas/apply_get_a.py @@ -1,20 +1,26 @@ +from pprint import pprint + def apply_get_a(operations_diff_file): - length_a = max([operation["a2"] for operation in operations_diff_file]) - a = [''] * length_a - for operation in operations_diff_file: + try: + length_a = max([operation["a2"] for operation in operations_diff_file]) + a = [''] * length_a + + for operation in operations_diff_file: - if operation["name"] == "equal" or operation["name"] == "delete": - #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) - if "tokens" in operation.keys(): - a[operation["a1"]:operation["a2"]] = operation["tokens"] + if operation["name"] == "equal" or operation["name"] == "delete": + #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) + if "tokens" in operation.keys(): + a[operation["a1"]:operation["a2"]] = operation["tokens"] - elif operation["name"] == "insert": - 
#print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) - pass + elif operation["name"] == "insert": + #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) + pass - else: - raise TypeError("Unexpected operation type " + \ - "{0}".format(type(operation))) + else: + raise TypeError("Unexpected operation type " + \ + "{0}".format(type(operation))) - return ' '.join(a) + return ' '.join(a) + except: + pprint(operations_diff_file) diff --git a/deltas/apply_get_b.py b/deltas/apply_get_b.py index 0e4bedf..1761b51 100644 --- a/deltas/apply_get_b.py +++ b/deltas/apply_get_b.py @@ -1,21 +1,27 @@ +from pprint import pprint + def apply_get_b(operations_diff_file): - length_b = max([operation["b2"] for operation in operations_diff_file]) - b = [''] * length_b + try: + + length_b = max([operation["b2"] for operation in operations_diff_file]) + b = [''] * length_b - for operation in operations_diff_file: + for operation in operations_diff_file: - if operation["name"] == "equal" or operation["name"] == "insert": - #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) - if "tokens" in operation.keys(): - b[operation["b1"]:operation["b2"]] = operation["tokens"] + if operation["name"] == "equal" or operation["name"] == "insert": + #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) + if "tokens" in operation.keys(): + b[operation["b1"]:operation["b2"]] = operation["tokens"] - elif operation["name"] == "delete": - #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) - pass + elif operation["name"] == "delete": + #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) + pass - else: - raise TypeError("Unexpected operation type " + \ - "{0}".format(type(operation))) + else: + raise TypeError("Unexpected operation type " + \ + "{0}".format(type(operation))) - return ' '.join(b) + return ' '.join(b) + except: + pprint(operations_diff_file) From c9833b8063c6a83fa44a46d1f7c371d283e1e0a6 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Joubert?= Date: Mon, 25 Mar 2019 07:58:06 +0100 Subject: [PATCH 4/4] Revert "handling errors" This reverts commit 86e74d265c05457d0aec412f5b64812a0cc81ce2. --- deltas/apply_get_a.py | 34 ++++++++++++++-------------------- deltas/apply_get_b.py | 34 ++++++++++++++-------------------- 2 files changed, 28 insertions(+), 40 deletions(-) diff --git a/deltas/apply_get_a.py b/deltas/apply_get_a.py index 8e2dff4..58a8c4c 100644 --- a/deltas/apply_get_a.py +++ b/deltas/apply_get_a.py @@ -1,26 +1,20 @@ -from pprint import pprint - def apply_get_a(operations_diff_file): + length_a = max([operation["a2"] for operation in operations_diff_file]) + a = [''] * length_a - try: - length_a = max([operation["a2"] for operation in operations_diff_file]) - a = [''] * length_a - - for operation in operations_diff_file: + for operation in operations_diff_file: - if operation["name"] == "equal" or operation["name"] == "delete": - #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) - if "tokens" in operation.keys(): - a[operation["a1"]:operation["a2"]] = operation["tokens"] + if operation["name"] == "equal" or operation["name"] == "delete": + #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) + if "tokens" in operation.keys(): + a[operation["a1"]:operation["a2"]] = operation["tokens"] - elif operation["name"] == "insert": - #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) - pass + elif operation["name"] == "insert": + #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) + pass - else: - raise TypeError("Unexpected operation type " + \ - "{0}".format(type(operation))) + else: + raise TypeError("Unexpected operation type " + \ + "{0}".format(type(operation))) - return ' '.join(a) - except: - pprint(operations_diff_file) + return ' '.join(a) diff --git a/deltas/apply_get_b.py b/deltas/apply_get_b.py index 1761b51..0e4bedf 100644 --- a/deltas/apply_get_b.py +++ 
b/deltas/apply_get_b.py @@ -1,27 +1,21 @@ -from pprint import pprint - def apply_get_b(operations_diff_file): - try: - - length_b = max([operation["b2"] for operation in operations_diff_file]) - b = [''] * length_b + length_b = max([operation["b2"] for operation in operations_diff_file]) + b = [''] * length_b - for operation in operations_diff_file: + for operation in operations_diff_file: - if operation["name"] == "equal" or operation["name"] == "insert": - #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) - if "tokens" in operation.keys(): - b[operation["b1"]:operation["b2"]] = operation["tokens"] + if operation["name"] == "equal" or operation["name"] == "insert": + #print("Equal: {0}".format(str(a_tokens[operation.a1:operation.a2]))) + if "tokens" in operation.keys(): + b[operation["b1"]:operation["b2"]] = operation["tokens"] - elif operation["name"] == "delete": - #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) - pass + elif operation["name"] == "delete": + #print("Insert: {0}".format(str(b_tokens[operation.b1:operation.b2]))) + pass - else: - raise TypeError("Unexpected operation type " + \ - "{0}".format(type(operation))) + else: + raise TypeError("Unexpected operation type " + \ + "{0}".format(type(operation))) - return ' '.join(b) - except: - pprint(operations_diff_file) + return ' '.join(b)