HARVEST/pdf_annotator.py at main · IntegrativeBioinformaticsLab/HARVEST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Annotation Manager
Handles adding and retrieving highlights from PDF files using PyMuPDF (fitz)
"""

import hashlib
import json
import logging
import math
import os
from typing import List, Dict, Tuple, Optional

import fitz  # PyMuPDF

# Module-scoped logger; records are emitted under this module's import name.
logger = logging.getLogger(__name__)

# Hard caps enforced on every request so a single call cannot be abused.
MAX_FILE_SIZE = 100 * 1024 ** 2  # largest PDF accepted, in bytes (100 MB)
MAX_HIGHLIGHT_TEXT_LENGTH = 10_000  # longest text allowed on a single highlight
MAX_HIGHLIGHTS_PER_REQUEST = 50  # most highlights accepted per request


def validate_highlight_data(highlight: Dict) -> Tuple[bool, Optional[str]]:
    """
    Validate a single highlight object.

    Checks are applied in the order below and the first failure is reported:

    * the highlight is a dict containing 'page', 'rects' and 'color'
    * 'page' is a non-negative int (bool is rejected although it subclasses int)
    * 'rects' is a non-empty list of 4-element lists of finite numbers
    * 'color' is '#RGB' / '#RRGGBB' made of hexadecimal digits, or a
      3-element list of numbers in the range 0-1
    * the optional 'text' is a str of at most MAX_HIGHLIGHT_TEXT_LENGTH chars

    Args:
        highlight: Dictionary with highlight data

    Returns:
        Tuple of (is_valid, error_message); error_message is None when valid
    """
    # Anything that is not a dict cannot be inspected field by field; report
    # it as a validation error instead of letting a TypeError escape.
    if not isinstance(highlight, dict):
        return False, "Highlight must be a dictionary"

    required_fields = ['page', 'rects', 'color']

    # Check required fields
    for field in required_fields:
        if field not in highlight:
            return False, f"Missing required field: {field}"

    # Validate page number (True/False would otherwise pass as 1/0)
    page = highlight['page']
    if isinstance(page, bool) or not isinstance(page, int) or page < 0:
        return False, "Invalid page number"

    # Validate rects
    rects = highlight['rects']
    if not isinstance(rects, list) or len(rects) == 0:
        return False, "Rects must be a non-empty list"

    # Validate each rect has 4 finite numeric coordinates
    for rect in rects:
        if not isinstance(rect, list) or len(rect) != 4:
            return False, "Each rect must have exactly 4 coordinates [x0, y0, x1, y1]"
        for coord in rect:
            if isinstance(coord, bool) or not isinstance(coord, (int, float)):
                return False, "All rect coordinates must be numbers"
            # json.loads accepts NaN/Infinity by default; such values cannot
            # describe a rectangle on a page.
            if isinstance(coord, float) and not math.isfinite(coord):
                return False, "All rect coordinates must be finite numbers"

    # Validate color (should be a hex color or RGB array)
    color = highlight['color']
    if isinstance(color, str):
        # Validate hex color: prefix, length AND digits, so that the later
        # conversion to RGB cannot fail on a string that passed validation.
        digits = color[1:]
        if (
            not color.startswith('#')
            or len(color) not in [4, 7]
            or any(ch not in "0123456789abcdefABCDEF" for ch in digits)
        ):
            return False, "Invalid color format (use #RGB or #RRGGBB)"
    elif isinstance(color, list):
        if len(color) != 3 or not all(
            isinstance(x, (int, float)) and not isinstance(x, bool) and 0 <= x <= 1
            for x in color
        ):
            return False, "Color as array must be [r, g, b] with values 0-1"
    else:
        return False, "Color must be a hex string or RGB array"

    # Validate optional text field length
    if 'text' in highlight:
        if not isinstance(highlight['text'], str):
            return False, "Text must be a string"
        if len(highlight['text']) > MAX_HIGHLIGHT_TEXT_LENGTH:
            return False, f"Highlight text exceeds maximum length of {MAX_HIGHLIGHT_TEXT_LENGTH}"

    return True, None


def hex_to_rgb(hex_color: str) -> Tuple[float, float, float]:
    """
    Convert hex color to RGB tuple (0-1 range).

    Accepts the 6-digit form and the 3-digit shorthand, with or without a
    single leading '#'. Anything else is rejected rather than partially
    parsed.

    Args:
        hex_color: Hex color string like "#FFFF00" or "#FF0"

    Returns:
        Tuple of (r, g, b) in 0-1 range

    Raises:
        ValueError: If the string is not 3 or 6 hexadecimal digits
    """
    # Drop at most one leading '#'; a second one is part of the (invalid) value.
    digits = hex_color[1:] if hex_color.startswith('#') else hex_color

    # Handle shorthand hex (#RGB -> #RRGGBB)
    if len(digits) == 3:
        digits = ''.join(ch * 2 for ch in digits)

    # Check the digits explicitly: int(..., 16) alone also accepts signs,
    # surrounding whitespace and underscores, and slicing would silently
    # ignore anything past the sixth character.
    if len(digits) != 6 or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
        raise ValueError(f"Invalid hex color: {hex_color!r} (use #RGB or #RRGGBB)")

    # Convert each two-digit channel to the 0-1 range
    r, g, b = (int(digits[i:i + 2], 16) / 255.0 for i in (0, 2, 4))

    return (r, g, b)


def add_highlights_to_pdf(pdf_path: str, highlights: List[Dict]) -> Tuple[bool, str]:
    """
    Add highlights to a PDF file.

    All highlights are validated before the PDF is opened; a single invalid
    highlight rejects the whole request. Highlights whose page number is
    outside the document are skipped with a warning and are not counted as
    added. The file is saved incrementally, and only when at least one
    highlight was actually added.

    Args:
        pdf_path: Path to the PDF file
        highlights: List of highlight dictionaries with structure:
            {
                'page': int,  # Page number (0-indexed)
                'rects': [[x0, y0, x1, y1], ...],  # Rectangle coordinates
                'color': '#FFFF00' or [1.0, 1.0, 0.0],  # Highlight color
                'text': 'highlighted text'  # Optional: the text being highlighted
            }

    Returns:
        Tuple of (success: bool, message: str). Errors are reported through
        the tuple (and logged); no exception is propagated to the caller.
    """
    try:
        # Security: Check file size
        if not os.path.exists(pdf_path):
            return False, "PDF file not found"

        file_size = os.path.getsize(pdf_path)
        if file_size > MAX_FILE_SIZE:
            return False, f"PDF file too large (max {MAX_FILE_SIZE/1024/1024}MB)"

        # Security: Check number of highlights
        if len(highlights) > MAX_HIGHLIGHTS_PER_REQUEST:
            return False, f"Too many highlights (max {MAX_HIGHLIGHTS_PER_REQUEST} per request)"

        # Validate all highlights first
        for i, highlight in enumerate(highlights):
            is_valid, error = validate_highlight_data(highlight)
            if not is_valid:
                return False, f"Invalid highlight at index {i}: {error}"

        # Open the PDF
        doc = fitz.open(pdf_path)
        added = 0
        skipped = 0

        try:
            page_count = len(doc)

            # Add each highlight
            for highlight in highlights:
                page_num = highlight['page']

                # Security: Check page number is valid
                if page_num < 0 or page_num >= page_count:
                    logger.warning(f"Invalid page number {page_num}, skipping")
                    skipped += 1
                    continue

                page = doc[page_num]

                # Convert color to RGB tuple if it's a hex string
                color = highlight['color']
                if isinstance(color, str):
                    color = hex_to_rgb(color)

                text = highlight.get('text')

                # Add highlight annotation for each rectangle
                for rect in highlight['rects']:
                    annot = page.add_highlight_annot(fitz.Rect(*rect))
                    annot.set_colors(stroke=color)

                    # Add optional text as annotation content
                    if text:
                        annot.set_info(content=text[:1000])  # Limit to 1000 chars

                    annot.update()

                added += 1

            # Incremental save preserves existing annotations; skip the write
            # entirely when every highlight was skipped and nothing changed.
            if added > 0:
                doc.saveIncr()
        finally:
            # Release the document on every exit path, including errors.
            doc.close()

        # Report what was really written, not what was requested.
        message = f"Successfully added {added} highlight(s)"
        if skipped:
            message += f" ({skipped} skipped: page out of range)"
        return True, message

    except Exception as e:
        logger.error(f"Error adding highlights to PDF: {e}", exc_info=True)
        return False, f"Error adding highlights: {str(e)}"


def _vertices_to_rects(vertices) -> List[List[float]]:
    """Collapse a flat sequence of quad corner points into [x0, y0, x1, y1] boxes."""
    rect_list = []
    # Vertices come in groups of 4 points (quadrilaterals); the range stop
    # drops an incomplete trailing group.
    for start in range(0, len(vertices) - 3, 4):
        quad = vertices[start:start + 4]
        if hasattr(quad[0], 'x'):
            # Point-like objects exposing .x / .y
            x_coords = [p.x for p in quad]
            y_coords = [p.y for p in quad]
        else:
            # Plain (x, y) pairs
            x_coords = [p[0] for p in quad]
            y_coords = [p[1] for p in quad]

        # Bounding box of the quadrilateral
        rect_list.append([
            min(x_coords), min(y_coords),
            max(x_coords), max(y_coords)
        ])
    return rect_list


def get_highlights_from_pdf(pdf_path: str) -> Tuple[bool, List[Dict], str]:
    """
    Extract existing highlights from a PDF file.

    Each returned highlight is a dict with 'page' (0-indexed), 'rects'
    (bounding boxes as [x0, y0, x1, y1]), 'color' (list of stroke color
    components, yellow when the annotation has none) and, when the annotation
    has content, 'text'. Highlight annotations without vertices are ignored.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Tuple of (success: bool, highlights: List[Dict], message: str).
        Errors are reported through the tuple (and logged); no exception is
        propagated to the caller.
    """
    try:
        if not os.path.exists(pdf_path):
            return False, [], "PDF file not found"

        # Security: Check file size
        file_size = os.path.getsize(pdf_path)
        if file_size > MAX_FILE_SIZE:
            return False, [], f"PDF file too large (max {MAX_FILE_SIZE/1024/1024}MB)"

        # Open the PDF
        doc = fitz.open(pdf_path)
        highlights = []

        try:
            # Iterate through all pages
            for page_num in range(len(doc)):
                page = doc[page_num]

                # Get all annotations on this page
                for annot in page.annots():
                    # We're only interested in highlight annotations
                    if annot.type[0] != fitz.PDF_ANNOT_HIGHLIGHT:
                        continue

                    vertices = annot.vertices
                    if not vertices:
                        continue

                    # The dict default alone is not enough: a "stroke" key
                    # that is present but None/empty must also fall back to
                    # yellow. Always hand back a list so the value has the
                    # same shape as the array form accepted when adding
                    # highlights.
                    stroke = annot.colors.get("stroke")
                    color_rgb = list(stroke) if stroke else [1.0, 1.0, 0.0]

                    highlight_data = {
                        'page': page_num,
                        'rects': _vertices_to_rects(vertices),
                        'color': color_rgb,
                    }

                    # Get annotation content (text); omitted when empty
                    text = annot.info.get("content", "")
                    if text:
                        highlight_data['text'] = text

                    highlights.append(highlight_data)
        finally:
            # Release the document on every exit path, including errors.
            doc.close()

        return True, highlights, f"Found {len(highlights)} highlight(s)"

    except Exception as e:
        logger.error(f"Error reading highlights from PDF: {e}", exc_info=True)
        return False, [], f"Error reading highlights: {str(e)}"


def clear_all_highlights(pdf_path: str) -> Tuple[bool, str]:
    """
    Remove all highlight annotations from a PDF file.

    Only annotations of the highlight type are deleted; other annotation
    types are left in place. The file is saved incrementally, and only when
    at least one highlight was removed.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Tuple of (success: bool, message: str). Errors are reported through
        the tuple (and logged); no exception is propagated to the caller.
    """
    try:
        if not os.path.exists(pdf_path):
            return False, "PDF file not found"

        # Security: Check file size
        file_size = os.path.getsize(pdf_path)
        if file_size > MAX_FILE_SIZE:
            return False, f"PDF file too large (max {MAX_FILE_SIZE/1024/1024}MB)"

        doc = fitz.open(pdf_path)
        count = 0

        try:
            for page_num in range(len(doc)):
                page = doc[page_num]

                # Materialize the annotations first so that deletions do not
                # happen while the generator is still being consumed.
                for annot in list(page.annots()):
                    if annot.type[0] == fitz.PDF_ANNOT_HIGHLIGHT:
                        page.delete_annot(annot)
                        count += 1

            # Save changes (nothing to write when no highlight was found)
            if count > 0:
                doc.saveIncr()
        finally:
            # Release the document on every exit path, including errors.
            doc.close()

        return True, f"Removed {count} highlight(s)"

    except Exception as e:
        logger.error(f"Error clearing highlights from PDF: {e}", exc_info=True)
        return False, f"Error clearing highlights: {str(e)}"