diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index bd8a8cad..d1e09cd8 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -140,6 +140,10 @@ public function register(IRegistrationContext $context): void {
 			$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ReformatParagraphsProvider::class);
 		}
 		if ($isUsingOpenAI || $this->appConfig->getValueString(Application::APP_ID, 'analyze_image_provider_enabled') === '1') {
+			if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\ImageToTextOpticalCharacterRecognition')) {
+				$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\ImageToTextOcrTaskType::class);
+			}
+			$context->registerTaskProcessingProvider(\OCA\OpenAi\TaskProcessing\ImageToTextOcrProvider::class);
 			if (!class_exists('OCP\\TaskProcessing\\TaskTypes\\AnalyzeImages')) {
 				$context->registerTaskProcessingTaskType(\OCA\OpenAi\TaskProcessing\AnalyzeImagesTaskType::class);
 			}
diff --git a/lib/TaskProcessing/ImageToTextOcrProvider.php b/lib/TaskProcessing/ImageToTextOcrProvider.php
new file mode 100644
index 00000000..390e19c3
--- /dev/null
+++ b/lib/TaskProcessing/ImageToTextOcrProvider.php
@@ -0,0 +1,222 @@
+openAiAPIService->getServiceName();
+	}
+
+	/**
+	 * Uses the server's OCR task type ID when the OCP class exists, otherwise this app's fallback type ID.
+	 */
+	public function getTaskTypeId(): string {
+		if (class_exists('OCP\\TaskProcessing\\TaskTypes\\ImageToTextOpticalCharacterRecognition')) {
+			return \OCP\TaskProcessing\TaskTypes\ImageToTextOpticalCharacterRecognition::ID;
+		}
+		return ImageToTextOcrTaskType::ID;
+	}
+
+	public function getExpectedRuntime(): int {
+		return $this->openAiAPIService->getExpTextProcessingTime();
+	}
+
+	public function getInputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function getInputShapeDefaults(): array {
+		return [];
+	}
+
+	public function getOptionalInputShape(): array {
+		return [
+			'max_tokens' => new ShapeDescriptor(
+				$this->l->t('Maximum output words'),
+				$this->l->t('The maximum number of words/tokens that can be generated in the output.'),
+				EShapeType::Number
+			),
+			'model' => new ShapeDescriptor(
+				$this->l->t('Model'),
+				$this->l->t('The model used to generate the output'),
+				EShapeType::Enum
+			),
+		];
+	}
+
+	public function getOptionalInputShapeEnumValues(): array {
+		return [
+			'model' => $this->openAiAPIService->getModelEnumValues($this->userId),
+		];
+	}
+
+	public function getOptionalInputShapeDefaults(): array {
+		$adminModel = $this->openAiSettingsService->getAdminDefaultCompletionModelId();
+		return [
+			'max_tokens' => $this->openAiSettingsService->getMaxTokens(),
+			'model' => $adminModel,
+		];
+	}
+
+	public function getOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	public function getOptionalOutputShape(): array {
+		return [];
+	}
+
+	public function getOptionalOutputShapeEnumValues(): array {
+		return [];
+	}
+
+	/**
+	 * Sends each input image to the chat completion endpoint with an OCR prompt
+	 * and collects the last message of every completion.
+	 *
+	 * @param string|null $userId Passed through to the chat completion call
+	 * @param array $input Task input: 'input' is the list of image files, 'model' and 'max_tokens' are optional
+	 * @param callable $reportProgress Called after each processed image with the fraction done so far
+	 * @return array The collected messages under the 'output' key
+	 * @throws RuntimeException On invalid input, an unreadable file or a failed completion
+	 * @throws UserFacingProcessingException If the files exceed 50MB in total or a file is not an image
+	 */
+	public function process(?string $userId, array $input, callable $reportProgress): array {
+		if (!$this->openAiAPIService->isUsingOpenAi() && !$this->openAiSettingsService->getChatEndpointEnabled()) {
+			throw new RuntimeException('Must support chat completion endpoint');
+		}
+
+		if (!isset($input['input']) || !is_array($input['input'])) {
+			throw new RuntimeException('Invalid file list');
+		}
+		if (count($input['input']) === 0) {
+			throw new RuntimeException('Invalid file list');
+		}
+		if (count($input['input']) > 500) {
+			throw new RuntimeException('Too many files given. Max is 500');
+		}
+
+		if (isset($input['model']) && is_string($input['model'])) {
+			$model = $input['model'];
+		} else {
+			$model = $this->openAiSettingsService->getAdminDefaultCompletionModelId();
+		}
+
+		$maxTokens = null;
+		if (isset($input['max_tokens']) && is_int($input['max_tokens'])) {
+			$maxTokens = $input['max_tokens'];
+		}
+
+		$fileSize = 0;
+		$outputs = [];
+		$systemPrompt = 'Extract all visible text from the image. Return only the extracted text without additional commentary. Preserve the original language of the text.';
+		$userPrompt = 'Extract all text from this image.';
+
+		foreach ($input['input'] as $i => $file) {
+			if (!$file instanceof File || !$file->isReadable()) {
+				throw new RuntimeException('Invalid input file');
+			}
+			// The 50MB limit applies to the running total of all files seen so far
+			$fileSize += intval($file->getSize());
+			if ($fileSize > 50 * 1000 * 1000) {
+				throw new UserFacingProcessingException('Filesize of input files too large. Max is 50MB', userFacingMessage: $this->l->t('Filesize of input files too large. Max is 50MB'));
+			}
+
+			$fileType = $file->getMimeType();
+			if (!str_starts_with($fileType, 'image/')) {
+				throw new UserFacingProcessingException('Only supports image file types, got ' . $fileType, userFacingMessage: $this->l->t('Only supports image file types'));
+			}
+			if ($this->openAiAPIService->isUsingOpenAi()) {
+				// Stricter allow-list, applied only when isUsingOpenAi() reports true
+				$validFileTypes = [
+					'image/jpeg',
+					'image/png',
+					'image/gif',
+					'image/webp',
+				];
+				if (!in_array($fileType, $validFileTypes, true)) {
+					throw new RuntimeException('Invalid input file type for OpenAI ' . $fileType);
+				}
+			}
+
+			// fopen() may return false; fail with a clear error instead of handing false to stream_get_contents()
+			$handle = $file->fopen('rb');
+			if ($handle === false) {
+				throw new RuntimeException('Could not open input file');
+			}
+			$fileContents = stream_get_contents($handle);
+			fclose($handle);
+			if ($fileContents === false) {
+				throw new RuntimeException('Could not read input file');
+			}
+			$inputFile = base64_encode($fileContents);
+			// The image travels inline as a base64 data URL inside a single user message
+			$history = [
+				json_encode([
+					'role' => 'user',
+					'content' => [
+						[
+							'type' => 'image_url',
+							'image_url' => [
+								'url' => 'data:' . $fileType . ';base64,' . $inputFile,
+							],
+						],
+					],
+				]),
+			];
+
+			try {
+				$completion = $this->openAiAPIService->createChatCompletion($userId, $model, $userPrompt, $systemPrompt, $history, 1, $maxTokens);
+				$messages = $completion['messages'];
+
+				if (count($messages) === 0) {
+					throw new RuntimeException('No result in OpenAI/LocalAI response.');
+				}
+
+				$outputs[] = array_pop($messages);
+				$reportProgress(($i + 1) / count($input['input']));
+			} catch (\Exception $e) {
+				$this->logger->warning('OpenAI/LocalAI\'s OCR failed with: ' . $e->getMessage(), ['exception' => $e]);
+				// Chain the original exception so its type and stack trace stay available
+				throw new RuntimeException('OpenAI/LocalAI\'s OCR failed with: ' . $e->getMessage(), 0, $e);
+			}
+		}
+
+		return ['output' => $outputs];
+	}
+}
diff --git a/lib/TaskProcessing/ImageToTextOcrTaskType.php b/lib/TaskProcessing/ImageToTextOcrTaskType.php
new file mode 100644
index 00000000..7e898a6e
--- /dev/null
+++ b/lib/TaskProcessing/ImageToTextOcrTaskType.php
@@ -0,0 +1,72 @@
+l->t('Optical character recognition');
+	}
+
+	/**
+	 * @inheritDoc
+	 */
+	public function getDescription(): string {
+		return $this->l->t('Extract text from the given images.');
+	}
+
+	/**
+	 * @return string
+	 */
+	public function getId(): string {
+		return self::ID;
+	}
+
+	/**
+	 * @return ShapeDescriptor[]
+	 */
+	public function getInputShape(): array {
+		return [
+			'input' => new ShapeDescriptor(
+				$this->l->t('Images'),
+				$this->l->t('Images to extract text from'),
+				EShapeType::ListOfFiles,
+			),
+		];
+	}
+
+	/**
+	 * @return ShapeDescriptor[]
+	 */
+	public function getOutputShape(): array {
+		return [
+			'output' => new ShapeDescriptor(
+				$this->l->t('Output texts'),
+				$this->l->t('The texts that were extracted from the images'),
+				EShapeType::ListOfTexts
+			),
+		];
+	}
+}