-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
203 lines (164 loc) · 5.83 KB
/
utils.py
File metadata and controls
203 lines (164 loc) · 5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import re
import fitz # from PyMuPDF
from docx import Document
from fpdf import FPDF
from markdown import markdown
# ---------- File Parsing ----------
def extract_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``page.get_text()`` result of each page, joined with newlines.

    Raises:
        RuntimeError: If opening or reading the document fails for any
            reason. The underlying exception is attached as ``__cause__``.
    """
    try:
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Chain explicitly so the original error and traceback stay
        # attached to the RuntimeError that callers see.
        raise RuntimeError(f"Error reading PDF: {e}") from e
def extract_text_from_docx(file_path):
    """
    Extract the paragraph text of a DOCX file and return it as one string.

    Only ``doc.paragraphs`` is read, so text that the document stores
    outside that collection is not included.

    Args:
        file_path: Location of the DOCX; passed unchanged to ``Document``.

    Returns:
        The ``text`` of each paragraph, joined with newlines.

    Raises:
        RuntimeError: If opening or reading the document fails for any
            reason. The underlying exception is attached as ``__cause__``.
    """
    try:
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        # Chain explicitly so the original error and traceback stay
        # attached to the RuntimeError that callers see.
        raise RuntimeError(f"Error reading DOCX: {e}") from e
# ---------- Emoji Removal ----------
# Compiled once at import time. Each range is a real emoji / pictograph
# block. A single span from U+24C2 to U+1F251 must NOT be used here: it
# would also cover CJK ideographs, kana, Hangul and full-width forms and
# delete ordinary text written in those scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed characters
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # chess symbols, pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled M
    "\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE"  # geometric shapes used as emoji
    "\u2934\u2935"           # curved arrows
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
    "\uFE0F"                 # emoji presentation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """
    Remove emoji and pictographic symbols from the given text.

    Only characters in dedicated emoji/symbol blocks are removed; letters
    of any script (including CJK, kana and Hangul) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of matching characters deleted. Surrounding
        whitespace is not collapsed.
    """
    return _EMOJI_PATTERN.sub("", text)
# ---------- Smart Character Cleanup ----------
# Translation table for the typographic characters handled by
# clean_text_for_pdf: curly quotes and en/em dashes.
_SMART_CHAR_TABLE = str.maketrans({
    "‘": "'",
    "’": "'",
    "“": '"',
    "”": '"',
    "–": "-",
    "—": "-",
})


def clean_text_for_pdf(text):
    """
    Swap curly quotes and en/em dashes for their plain ASCII counterparts.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or an empty string when ``text`` is not a
        ``str`` at all.
    """
    if isinstance(text, str):
        # All substitutions are one character to one character, so a single
        # translate pass covers them.
        return text.translate(_SMART_CHAR_TABLE)
    return ""
# ---------- Strip Markdown ----------
def strip_markdown(md_text):
    """
    Convert Markdown text to plain text.

    The text is rendered to HTML with ``markdown``, the HTML tags are
    removed, and HTML entities are then decoded so that characters such as
    ``&``, ``<`` and ``"`` appear as themselves instead of as ``&amp;``,
    ``&lt;`` and ``&quot;``.

    Args:
        md_text: Markdown source text.

    Returns:
        The text content of the rendered document, without tags or
        entity escapes.
    """
    # Imported locally under a distinct name so this block does not depend
    # on a module-level ``html`` import.
    from html import unescape

    rendered = markdown(md_text)
    without_tags = re.sub(r"<[^<]+?>", "", rendered)  # remove HTML tags
    # Decode entities only after the tags are gone, so an escaped "<" in
    # the text can never be mistaken for the start of a tag.
    return unescape(without_tags)
# ---------- PDF Export ----------
class PDF(FPDF):
    """FPDF document with a fixed page title, a page-number footer and a
    helper for printing a titled block of selectively numbered lines."""

    def header(self):
        """Print the centred, bold document title followed by a small gap."""
        self.set_font("Times", "B", 14)
        self.cell(0, 10, "AI Generated Question Paper", ln=True, align="C")
        self.ln(5)

    def footer(self):
        """Print the centred, italic page number 15 units above the bottom."""
        self.set_y(-15)
        self.set_font("Times", "I", 9)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")

    def add_numbered_content(self, title, content_lines):
        """
        Print a bold title, then each non-blank line of ``content_lines``.

        A line receives a running number when it ends with "?", when it is
        longer than 60 characters and does not end with ":", or when the
        entry directly before it (blank or not) ends with the word
        "question", compared case-insensitively. All other lines are
        printed without a number.

        Args:
            title: Heading printed above the lines.
            content_lines: Indexable sequence of strings.
        """
        self.set_font("Times", "B", 12)
        self.cell(0, 10, title, ln=True)
        self.ln(2)
        self.set_font("Times", "", 12)

        next_number = 1
        for index, raw_line in enumerate(content_lines):
            text = raw_line.strip()
            if text == "":
                continue

            long_statement = len(text) > 60 and not text.endswith(":")
            follows_question_label = (
                index > 0
                and content_lines[index - 1].strip().lower().endswith("question")
            )

            if not (text.endswith("?") or long_statement or follows_question_label):
                self.multi_cell(0, 8, text)
                continue

            self.multi_cell(0, 8, f"{next_number}. {text}")
            next_number += 1

        # Gap after the whole block.
        self.ln(5)
def generate_pdf(_, full_text):
    """
    Render question-paper text as a PDF and return the document as bytes.

    The text is cleaned (emoji removed, smart punctuation replaced, Markdown
    stripped) and then split at the first "Answer Key" heading, matched
    case-insensitively. Everything before the heading is printed as a
    numbered list under "Question Paper". Everything after it is printed on
    a new page under "Answer Key", after dropping lines that start with
    "<digits>. " and lines containing "Difficulty:", "Bloom" or "Marks:".

    A line containing exactly one "MCQ:" marker is printed as the number,
    a bold "MCQ:" label and the remaining text, each on its own line. An
    answer line containing ":" is printed as a bold label followed by the
    rest of the line.

    Args:
        _: Unused; kept so existing callers keep working.
        full_text: The generated paper, optionally containing an answer key.

    Returns:
        The PDF document as ``bytes``.

    Raises:
        ValueError: If ``full_text`` is not a string, or if fewer than 20
            non-whitespace-trimmed characters remain after cleaning.
    """
    invalid_msg = "The text content for PDF generation is invalid or too short."

    # Validate the type before any cleaning helper runs; otherwise non-text
    # input fails inside a helper with an unrelated TypeError.
    if not isinstance(full_text, str):
        raise ValueError(invalid_msg)

    # Clean and normalize content
    full_text = remove_emojis(full_text)
    full_text = clean_text_for_pdf(full_text)
    full_text = strip_markdown(full_text)
    # The built-in "Times" font is limited to Latin-1. Replace anything
    # outside that range with "?" rather than failing while rendering.
    full_text = full_text.encode("latin-1", errors="replace").decode("latin-1")

    if len(full_text.strip()) < 20:
        raise ValueError(invalid_msg)

    pdf = PDF()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Split content into questions and answers
    sections = re.split(
        r'\bAnswer\s*Key\b', full_text, maxsplit=1, flags=re.IGNORECASE
    )
    questions = sections[0].strip().splitlines()
    raw_answers = sections[1].strip().splitlines() if len(sections) > 1 else []

    # Filter out repeated questions and metadata lines in the answer section
    metadata_markers = ("Difficulty:", "Bloom", "Marks:")
    answers = [
        line.strip()
        for line in raw_answers
        if not re.match(r"^\d+\.\s", line)
        and not any(marker in line for marker in metadata_markers)
    ]

    # Page 1 - Questions
    pdf.add_page()
    pdf.set_font("Times", "B", 16)
    pdf.cell(0, 10, "Question Paper", ln=True)
    pdf.ln(5)

    pdf.set_font("Times", "", 12)
    q_num = 1
    for line in questions:
        if not line.strip():
            continue

        if "MCQ:" in line:
            mcq_parts = line.split("MCQ:")
            # Only a line with exactly one marker gets the bold-label
            # layout; anything else falls through to the plain layout.
            if len(mcq_parts) == 2:
                pdf.set_font("Times", "", 12)
                pdf.multi_cell(0, 8, f"{q_num}. ")
                pdf.set_font("Times", "B", 12)
                pdf.multi_cell(0, 8, "MCQ:")
                pdf.set_font("Times", "", 12)
                pdf.multi_cell(0, 8, mcq_parts[1].strip())
                q_num += 1
                continue

        pdf.multi_cell(0, 8, f"{q_num}. {line.strip()}")
        q_num += 1

    # Page 2 - Answer Key
    if answers:
        pdf.add_page()
        pdf.set_font("Times", "B", 16)
        pdf.cell(0, 10, "Answer Key", ln=True)
        pdf.ln(5)

        pdf.set_font("Times", "", 12)
        a_num = 1
        for line in answers:
            if not line.strip():
                continue
            if ":" in line:
                label, detail = line.split(":", 1)
                pdf.set_font("Times", "B", 12)
                pdf.multi_cell(0, 8, f"{a_num}. {label.strip()}:")
                pdf.set_font("Times", "", 12)
                pdf.multi_cell(0, 8, detail.strip())
            else:
                pdf.multi_cell(0, 8, f"{a_num}. {line.strip()}")
            a_num += 1

    rendered = pdf.output(dest='S')
    # Depending on the installed FPDF implementation the in-memory output is
    # either a latin-1 text string or already a bytes-like object.
    if isinstance(rendered, str):
        return rendered.encode('latin1')
    return bytes(rendered)