diff --git a/html/__init__.py b/html/__init__.py index c6ebbbb..af56099 100644 --- a/html/__init__.py +++ b/html/__init__.py @@ -2,13 +2,20 @@ """ Wrapper methods used for mapping HTML to docx objects +Obtained from: https://github.com/fokoenecke/html_docx """ +import re + from lxml.html import fromstring -from mindboard.helpers.docx.html.converter import DocxBuilder +from converter import DocxBuilder def add_html(container, html_string): + + # NOTE: Added for backward compatibility with line breaks in text + html_string = re.sub('\n', '
', html_string) + root = fromstring(html_string) builder = DocxBuilder(container=container) builder.from_html_tree(root=root) \ No newline at end of file diff --git a/html/converter.py b/html/converter.py index 2e51a2b..50a0883 100644 --- a/html/converter.py +++ b/html/converter.py @@ -5,7 +5,7 @@ mapping HTML tags to their corresponding python-docx functions. Appending full HTML structure to the given document. """ -from mindboard.helpers.docx.html.dispatcher import get_tag_dispatcher +from dispatcher import get_tag_dispatcher class DocxBuilder(object): @@ -38,6 +38,8 @@ def _append_docx_elements(self, html_element, container): for child in children: self._append_docx_elements(child, new_container) - dispatcher = get_tag_dispatcher(html_element.getparent().tag) + parent = html_element.getparent() + if parent is not None: + dispatcher = get_tag_dispatcher(parent.tag) if html_element.tail and dispatcher: dispatcher.append_tail(html_element, container) diff --git a/html/dispatcher.py b/html/dispatcher.py index 30919d9..e062a77 100644 --- a/html/dispatcher.py +++ b/html/dispatcher.py @@ -5,15 +5,17 @@ the different docx elements. """ -from mindboard.helpers.docx.html.tag_dispatchers.blockquote import BlockquoteDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.code import CodeDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.emphasis import EmphasisDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.heading import HeadingDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.linebreak import LineBreakDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.link import LinkDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.list_item import ListItemDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.paragraph import ParagraphDispatcher -from mindboard.helpers.docx.html.tag_dispatchers.strong import StrongDispatcher +from tag_dispatchers.blockquote import BlockquoteDispatcher +from tag_dispatchers.code import CodeDispatcher +from tag_dispatchers.emphasis import EmphasisDispatcher +from tag_dispatchers.heading import HeadingDispatcher +from tag_dispatchers.linebreak import LineBreakDispatcher +from tag_dispatchers.link import LinkDispatcher +from tag_dispatchers.list_item import ListItemDispatcher +from tag_dispatchers.paragraph import ParagraphDispatcher +from tag_dispatchers.strong import StrongDispatcher +from tag_dispatchers.underline import UnderlineDispatcher +from tag_dispatchers.div import DivDispatcher def get_tag_dispatcher(html_tag): @@ -31,8 +33,12 @@ def get_tag_dispatcher(html_tag): a=LinkDispatcher(), li=ListItemDispatcher(), br=LineBreakDispatcher(), + div=DivDispatcher(), code=CodeDispatcher(), strong=StrongDispatcher(), + b=StrongDispatcher(), + i=EmphasisDispatcher(), + u=UnderlineDispatcher(), em=EmphasisDispatcher(), h1=heading_dispatcher, h2=heading_dispatcher, diff --git a/html/tag_dispatchers/__init__.py b/html/tag_dispatchers/__init__.py index 9c5853f..fa687fc 100644 --- a/html/tag_dispatchers/__init__.py +++ b/html/tag_dispatchers/__init__.py @@ -55,4 +55,19 @@ def replace_whitespaces(text): text = ' '.join(text.split('\n')) text = re.sub(' +', ' ', text) - return text if text else '' \ No newline at end of file + return text if text else '' + + +# From: http://stackoverflow.com/questions/3170055/test-if-lists-share-any-items-in-python +def lists_overlap(a, b): + sb = set(b) + return any(el in sb for el in a) + + +# Get parental tags (so that bold+underline+italics is dealt with correctly) +def get_parental_tags(temp_element): + temp_array = [] + while temp_element.getparent(): + temp_element = temp_element.getparent() + temp_array.append(temp_element.tag) + return temp_array diff --git a/html/tag_dispatchers/blockquote.py b/html/tag_dispatchers/blockquote.py index fa29200..630a4d0 100644 --- a/html/tag_dispatchers/blockquote.py +++ b/html/tag_dispatchers/blockquote.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher +from ..tag_dispatchers import TagDispatcher class BlockquoteDispatcher(TagDispatcher): diff --git a/html/tag_dispatchers/code.py b/html/tag_dispatchers/code.py index d7e8b68..82d9f72 100644 --- a/html/tag_dispatchers/code.py +++ b/html/tag_dispatchers/code.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher +from ..tag_dispatchers import TagDispatcher class CodeDispatcher(TagDispatcher): diff --git a/html/tag_dispatchers/div.py b/html/tag_dispatchers/div.py new file mode 100644 index 0000000..e03e841 --- /dev/null +++ b/html/tag_dispatchers/div.py @@ -0,0 +1,33 @@ +# encoding: utf-8 +from ..tag_dispatchers import TagDispatcher, replace_whitespaces + + +class DivDispatcher(TagDispatcher): + def __init__(self): + super(DivDispatcher, self).__init__() + + @classmethod + def append_head(cls, element, container): + paragraph = cls.get_new_paragraph(container) + return cls._append_div(element.text, element, paragraph) + + @classmethod + def append_tail(cls, element, container): + paragraph = cls.get_current_paragraph(container) + return cls._append_div(element.tail, element, paragraph) + + @classmethod + def _append_div(cls, text, element, container): + """ +
does nothing. + """ + text = replace_whitespaces(text) + if not text: + return container + + style = None + if element.getparent().tag == 'blockquote': + style = 'IntenseQuote' + + container.add_run(text=text, style=style) + return container \ No newline at end of file diff --git a/html/tag_dispatchers/emphasis.py b/html/tag_dispatchers/emphasis.py index 524ad88..0990364 100644 --- a/html/tag_dispatchers/emphasis.py +++ b/html/tag_dispatchers/emphasis.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces +from ..tag_dispatchers import TagDispatcher, replace_whitespaces, lists_overlap, get_parental_tags class EmphasisDispatcher(TagDispatcher): @@ -23,6 +23,8 @@ def _append_emphasis(cls, text, element, container): text = replace_whitespaces(text) run = container.add_run(text=text) run.italic = True - if element.getparent().tag == 'strong': + if lists_overlap(get_parental_tags(element), ('strong', 'b')): run.bold = True + if lists_overlap(get_parental_tags(element), ('u',)): + run.underline = True return container \ No newline at end of file diff --git a/html/tag_dispatchers/heading.py b/html/tag_dispatchers/heading.py index 752a5a6..deff774 100644 --- a/html/tag_dispatchers/heading.py +++ b/html/tag_dispatchers/heading.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher +from ..tag_dispatchers import TagDispatcher class HeadingDispatcher(TagDispatcher): diff --git a/html/tag_dispatchers/linebreak.py b/html/tag_dispatchers/linebreak.py index a7e16cc..e679602 100644 --- a/html/tag_dispatchers/linebreak.py +++ b/html/tag_dispatchers/linebreak.py @@ -1,6 +1,6 @@ # encoding: utf-8 from docx.enum.text import WD_BREAK -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces +from ..tag_dispatchers import TagDispatcher, replace_whitespaces class LineBreakDispatcher(TagDispatcher): diff --git a/html/tag_dispatchers/link.py b/html/tag_dispatchers/link.py index 31bf62c..ad69031 100644 --- a/html/tag_dispatchers/link.py +++ b/html/tag_dispatchers/link.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher +from ..tag_dispatchers import TagDispatcher class LinkDispatcher(TagDispatcher): diff --git a/html/tag_dispatchers/list_item.py b/html/tag_dispatchers/list_item.py index 85318df..dec708d 100644 --- a/html/tag_dispatchers/list_item.py +++ b/html/tag_dispatchers/list_item.py @@ -1,7 +1,7 @@ # encoding: utf-8 # map of HTML list tags and their docx styles -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces +from ..tag_dispatchers import TagDispatcher, replace_whitespaces _list_style = dict( ol='ListNumber', diff --git a/html/tag_dispatchers/paragraph.py b/html/tag_dispatchers/paragraph.py index 17cd08d..146ff8c 100644 --- a/html/tag_dispatchers/paragraph.py +++ b/html/tag_dispatchers/paragraph.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces +from ..tag_dispatchers import TagDispatcher, replace_whitespaces class ParagraphDispatcher(TagDispatcher): diff --git a/html/tag_dispatchers/strong.py b/html/tag_dispatchers/strong.py index 4b05bc3..5b282f7 100644 --- a/html/tag_dispatchers/strong.py +++ b/html/tag_dispatchers/strong.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces +from ..tag_dispatchers import TagDispatcher, replace_whitespaces, lists_overlap, get_parental_tags class StrongDispatcher(TagDispatcher): @@ -23,6 +23,8 @@ def _append_strong(cls, text, element, container): text = replace_whitespaces(text) run = container.add_run(text=text) run.bold = True - if element.getparent().tag == 'em': + if lists_overlap(get_parental_tags(element), ('em', 'i')): run.italic = True + if lists_overlap(get_parental_tags(element), ('u',)): + run.underline = True return container \ No newline at end of file diff --git a/html/tag_dispatchers/underline.py b/html/tag_dispatchers/underline.py new file mode 100644 index 0000000..89cd23c --- /dev/null +++ b/html/tag_dispatchers/underline.py @@ -0,0 +1,30 @@ +# encoding: utf-8 +from ..tag_dispatchers import TagDispatcher, replace_whitespaces, lists_overlap, get_parental_tags + + +class UnderlineDispatcher(TagDispatcher): + def __init__(self): + super(UnderlineDispatcher, self).__init__() + + @classmethod + def append_head(cls, element, container): + return cls._append_underline(element.text, element, container) + + @classmethod + def append_tail(cls, element, container): + return cls._append_underline(element.tail, element, container) + + @classmethod + def _append_underline(cls, text, element, container): + """ + Creates an underline text run inside the paragraph container. + Appends remainder of text as a additional run + """ + text = replace_whitespaces(text) + run = container.add_run(text=text) + run.underline = True + if lists_overlap(get_parental_tags(element), ('strong', 'b')): + run.bold = True + if lists_overlap(get_parental_tags(element), ('em', 'i')): + run.italic = True + return container \ No newline at end of file