diff --git a/doctran/doctran.py b/doctran/doctran.py index 96894f2..a870d5a 100644 --- a/doctran/doctran.py +++ b/doctran/doctran.py @@ -75,6 +75,7 @@ class Document(BaseModel): content_type: ContentType raw_content: str transformed_content: str + system: Optional[str] config: DoctranConfig extracted_properties: Optional[Dict] = {} metadata: Optional[Dict[str, Any]] = None @@ -193,7 +194,7 @@ def __init__(self, openai_api_key: str = None, openai_model: str = "gpt-4", open if os.environ.get('OPENAI_API_VERSION'): self.config.openai.api_version = os.environ['OPENAI_API_VERSION'] - def parse(self, *, content: str, content_type: ContentType = "text", uri: str = None, metadata: dict = None) -> Document: + def parse(self, *, content: str,system: Optional[str] = None, content_type: ContentType = "text", uri: str = None, metadata: dict = None) -> Document: ''' Parse raw text and apply different chunking schemes based on the content type. @@ -204,5 +205,5 @@ def parse(self, *, content: str, content_type: ContentType = "text", uri: str = uri = str(uuid.uuid4()) if content_type == ContentType.text.value: # TODO: Optional chunking for documents that are too large - document = Document(id=str(uuid.uuid4()), content_type=content_type, raw_content=content, transformed_content=content, config=self.config, uri=uri, metadata=metadata) + document = Document(id=str(uuid.uuid4()), content_type=content_type, raw_content=content, transformed_content=content,system=system, config=self.config, uri=uri, metadata=metadata) return document diff --git a/doctran/transformers/transformers.py b/doctran/transformers/transformers.py index 6494e62..484259b 100644 --- a/doctran/transformers/transformers.py +++ b/doctran/transformers/transformers.py @@ -7,6 +7,7 @@ import tiktoken from doctran import Document, DoctranConfig, ExtractProperty, RecognizerEntity + class TooManyTokensException(Exception): def __init__(self, content_token_size: int, token_limit: int): super().__init__(f"OpenAI document transformation failed. The document is {content_token_size} tokens long, which exceeds the token limit of {token_limit}.") @@ -59,7 +60,9 @@ def executeOpenAICall(self, document: Document) -> Document: function_call = OpenAIFunctionCall( seed=self.config.openai_deployment_id, model=self.config.openai_model, - messages=[{"role": "user", "content": document.transformed_content}], + messages=[{"role": "system", "content":document.system}, + + {"role": "user", "content": document.transformed_content}], tools=[{ "type": "function", "function": {