From 9b8702458f3e6fa99ec5bf5c9aecdebeb52120c2 Mon Sep 17 00:00:00 2001 From: Maisha Thasin Date: Tue, 30 Apr 2024 16:28:17 -0400 Subject: [PATCH 1/6] add system messages --- doctran/doctran.py | 5 +++-- doctran/transformers/transformers.py | 6 ++++-- examples.ipynb | 18 ++++++++++++++++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/doctran/doctran.py b/doctran/doctran.py index 96894f2..cfed4b2 100644 --- a/doctran/doctran.py +++ b/doctran/doctran.py @@ -75,6 +75,7 @@ class Document(BaseModel): content_type: ContentType raw_content: str transformed_content: str + system: str config: DoctranConfig extracted_properties: Optional[Dict] = {} metadata: Optional[Dict[str, Any]] = None @@ -193,7 +194,7 @@ def __init__(self, openai_api_key: str = None, openai_model: str = "gpt-4", open if os.environ.get('OPENAI_API_VERSION'): self.config.openai.api_version = os.environ['OPENAI_API_VERSION'] - def parse(self, *, content: str, content_type: ContentType = "text", uri: str = None, metadata: dict = None) -> Document: + def parse(self, *, content: str,system:str, content_type: ContentType = "text", uri: str = None, metadata: dict = None) -> Document: ''' Parse raw text and apply different chunking schemes based on the content type. @@ -204,5 +205,5 @@ def parse(self, *, content: str, content_type: ContentType = "text", uri: str = uri = str(uuid.uuid4()) if content_type == ContentType.text.value: # TODO: Optional chunking for documents that are too large - document = Document(id=str(uuid.uuid4()), content_type=content_type, raw_content=content, transformed_content=content, config=self.config, uri=uri, metadata=metadata) + document = Document(id=str(uuid.uuid4()), content_type=content_type, raw_content=content, transformed_content=content,system=system, config=self.config, uri=uri, metadata=metadata) return document diff --git a/doctran/transformers/transformers.py b/doctran/transformers/transformers.py index 6494e62..6bd72be 100644 --- a/doctran/transformers/transformers.py +++ b/doctran/transformers/transformers.py @@ -7,6 +7,7 @@ import tiktoken from doctran import Document, DoctranConfig, ExtractProperty, RecognizerEntity + class TooManyTokensException(Exception): def __init__(self, content_token_size: int, token_limit: int): super().__init__(f"OpenAI document transformation failed. The document is {content_token_size} tokens long, which exceeds the token limit of {token_limit}.") @@ -59,7 +60,9 @@ def executeOpenAICall(self, document: Document) -> Document: function_call = OpenAIFunctionCall( seed=self.config.openai_deployment_id, model=self.config.openai_model, - messages=[{"role": "user", "content": document.transformed_content}], + messages=[{"role": "system", "content":document.system}, + + {"role": "user", "content": document.transformed_content}], tools=[{ "type": "function", "function": { @@ -80,7 +83,6 @@ def executeOpenAICall(self, document: Document) -> Document: f"Setting a higher token limit may fix this error. JSON returned: {arguments}") first_value = next(iter(arguments.values())) if len(arguments) > 1 or not isinstance(first_value, str): - # If multiple arguments or a dict/list is returned, treat arguments as extracted values document.extracted_properties = document.extracted_properties or arguments else: # If there is only one argument and it's a string, treat arguments as transformed content diff --git a/examples.ipynb b/examples.ipynb index e561131..1b79811 100644 --- a/examples.ipynb +++ b/examples.ipynb @@ -4,7 +4,21 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'openai'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mjson\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoctran\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Doctran, ExtractProperty\n", + "File \u001b[0;32m~/Desktop/doctran/doctran/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoctran\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Doctran, Document, DoctranConfig, ContentType, ExtractProperty, RecognizerEntity, Transformation\n", + "File \u001b[0;32m~/Desktop/doctran/doctran/doctran.py:4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mopenai\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01menum\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Enum\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openai'" + ] + } + ], "source": [ "import json\n", "import os\n", @@ -590,7 +604,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.1" }, "orig_nbformat": 4 }, From 074c7e2c7ced19c0e4a85067b1ba0760d9969bf7 Mon Sep 17 00:00:00 2001 From: Maisha Thasin Date: Tue, 30 Apr 2024 16:32:12 -0400 Subject: [PATCH 2/6] rmv examples --- examples.ipynb | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/examples.ipynb b/examples.ipynb index 1b79811..393babf 100644 --- a/examples.ipynb +++ b/examples.ipynb @@ -2,23 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'openai'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mjson\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoctran\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Doctran, ExtractProperty\n", - "File \u001b[0;32m~/Desktop/doctran/doctran/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoctran\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Doctran, Document, DoctranConfig, ContentType, ExtractProperty, RecognizerEntity, Transformation\n", - "File \u001b[0;32m~/Desktop/doctran/doctran/doctran.py:4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mopenai\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01menum\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Enum\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openai'" - ] - } - ], + "outputs": [], "source": [ "import json\n", "import os\n", From 5bd55275b85e27a6916c82d11c3b9f031e5fd448 Mon Sep 17 00:00:00 2001 From: Mai <63172145+maishathasin@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:33:15 -0400 Subject: [PATCH 3/6] Update examples.ipynb --- examples.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples.ipynb b/examples.ipynb index 393babf..ab72ca1 100644 --- a/examples.ipynb +++ b/examples.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ From 5b291e1e4cf5f57e5c04bb347229c3a6a50127a0 Mon Sep 17 00:00:00 2001 From: Mai <63172145+maishathasin@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:34:04 -0400 Subject: [PATCH 4/6] Update examples.ipynb --- examples.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples.ipynb b/examples.ipynb index ab72ca1..e561131 100644 --- a/examples.ipynb +++ b/examples.ipynb @@ -590,7 +590,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.11.4" }, "orig_nbformat": 4 }, From 0f0b831e01d5ce2b60ce41fa89f3c6906622cbe9 Mon Sep 17 00:00:00 2001 From: Maisha Thasin Date: Tue, 30 Apr 2024 16:35:34 -0400 Subject: [PATCH 5/6] rmv examples --- doctran/transformers/transformers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doctran/transformers/transformers.py b/doctran/transformers/transformers.py index 6bd72be..484259b 100644 --- a/doctran/transformers/transformers.py +++ b/doctran/transformers/transformers.py @@ -83,6 +83,7 @@ def executeOpenAICall(self, document: Document) -> Document: f"Setting a higher token limit may fix this error. JSON returned: {arguments}") first_value = next(iter(arguments.values())) if len(arguments) > 1 or not isinstance(first_value, str): + # If multiple arguments or a dict/list is returned, treat arguments as extracted values document.extracted_properties = document.extracted_properties or arguments else: # If there is only one argument and it's a string, treat arguments as transformed content From c9f47eacf77110dd2ce03e95f944443c78b4425e Mon Sep 17 00:00:00 2001 From: Maisha Thasin Date: Fri, 3 May 2024 16:01:17 -0400 Subject: [PATCH 6/6] add optional system message --- doctran/doctran.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doctran/doctran.py b/doctran/doctran.py index cfed4b2..a870d5a 100644 --- a/doctran/doctran.py +++ b/doctran/doctran.py @@ -75,7 +75,7 @@ class Document(BaseModel): content_type: ContentType raw_content: str transformed_content: str - system: str + system: Optional[str] config: DoctranConfig extracted_properties: Optional[Dict] = {} metadata: Optional[Dict[str, Any]] = None @@ -194,7 +194,7 @@ def __init__(self, openai_api_key: str = None, openai_model: str = "gpt-4", open if os.environ.get('OPENAI_API_VERSION'): self.config.openai.api_version = os.environ['OPENAI_API_VERSION'] - def parse(self, *, content: str,system:str, content_type: ContentType = "text", uri: str = None, metadata: dict = None) -> Document: + def parse(self, *, content: str,system: Optional[str] = None, content_type: ContentType = "text", uri: str = None, metadata: dict = None) -> Document: ''' Parse raw text and apply different chunking schemes based on the content type.