Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,20 @@

"""
Wrapper methods used for mapping HTML to docx objects
Obtained from: https://github.com/fokoenecke/html_docx
"""

import re

from lxml.html import fromstring
from mindboard.helpers.docx.html.converter import DocxBuilder
from converter import DocxBuilder


def add_html(container, html_string):
    """
    Convert an HTML string and append the result to ``container``.

    The markup is parsed with ``fromstring`` and the resulting tree is
    handed to a ``DocxBuilder`` that was created for ``container``.

    :param container: target object passed on to ``DocxBuilder``
    :param html_string: HTML markup to convert
    """
    # NOTE: Added for backward compatibility with line breaks in text:
    # every newline is turned into an explicit <br> before parsing.
    markup = re.sub('\n', '<br>', html_string)

    tree = fromstring(markup)
    DocxBuilder(container=container).from_html_tree(root=tree)
6 changes: 4 additions & 2 deletions html/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
mapping HTML tags to their corresponding python-docx functions.
Appending full HTML structure to the given document.
"""
from mindboard.helpers.docx.html.dispatcher import get_tag_dispatcher
from dispatcher import get_tag_dispatcher


class DocxBuilder(object):
Expand Down Expand Up @@ -38,6 +38,8 @@ def _append_docx_elements(self, html_element, container):
for child in children:
self._append_docx_elements(child, new_container)

dispatcher = get_tag_dispatcher(html_element.getparent().tag)
parent = html_element.getparent()
if parent is not None:
dispatcher = get_tag_dispatcher(parent.tag)
if html_element.tail and dispatcher:
dispatcher.append_tail(html_element, container)
24 changes: 15 additions & 9 deletions html/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
the different docx elements.
"""

from mindboard.helpers.docx.html.tag_dispatchers.blockquote import BlockquoteDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.code import CodeDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.emphasis import EmphasisDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.heading import HeadingDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.linebreak import LineBreakDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.link import LinkDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.list_item import ListItemDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.paragraph import ParagraphDispatcher
from mindboard.helpers.docx.html.tag_dispatchers.strong import StrongDispatcher
from tag_dispatchers.blockquote import BlockquoteDispatcher
from tag_dispatchers.code import CodeDispatcher
from tag_dispatchers.emphasis import EmphasisDispatcher
from tag_dispatchers.heading import HeadingDispatcher
from tag_dispatchers.linebreak import LineBreakDispatcher
from tag_dispatchers.link import LinkDispatcher
from tag_dispatchers.list_item import ListItemDispatcher
from tag_dispatchers.paragraph import ParagraphDispatcher
from tag_dispatchers.strong import StrongDispatcher
from tag_dispatchers.underline import UnderlineDispatcher
from tag_dispatchers.div import DivDispatcher


def get_tag_dispatcher(html_tag):
Expand All @@ -31,8 +33,12 @@ def get_tag_dispatcher(html_tag):
a=LinkDispatcher(),
li=ListItemDispatcher(),
br=LineBreakDispatcher(),
div=DivDispatcher(),
code=CodeDispatcher(),
strong=StrongDispatcher(),
b=StrongDispatcher(),
i=EmphasisDispatcher(),
u=UnderlineDispatcher(),
em=EmphasisDispatcher(),
h1=heading_dispatcher,
h2=heading_dispatcher,
Expand Down
17 changes: 16 additions & 1 deletion html/tag_dispatchers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,19 @@ def replace_whitespaces(text):
text = ' '.join(text.split('\n'))
text = re.sub(' +', ' ', text)

return text if text else ''
return text if text else ''


# From: http://stackoverflow.com/questions/3170055/test-if-lists-share-any-items-in-python
def lists_overlap(a, b):
    """
    Tell whether ``a`` and ``b`` have at least one item in common.

    :param a: iterable whose items are looked up one by one
    :param b: iterable of hashable items, converted to a set for the lookup
    :return: ``True`` as soon as an item of ``a`` is found in ``b``,
             ``False`` otherwise (also when either one is empty)
    """
    lookup = set(b)
    for candidate in a:
        if candidate in lookup:
            return True
    return False


# Collect the tags of all ancestors (so that nested bold + underline + italics are handled correctly)
def get_parental_tags(temp_element):
    """
    Collect the tag names of every ancestor of ``temp_element``.

    :param temp_element: node exposing ``getparent()`` and ``tag``
    :return: list of ancestor tags, nearest parent first and outermost
             ancestor last; empty when the element has no parent
    """
    temp_array = []
    # Compare against None explicitly: truth-testing an lxml element reflects
    # its number of children (and is deprecated), not whether it exists.
    parent = temp_element.getparent()
    while parent is not None:
        temp_array.append(parent.tag)
        parent = parent.getparent()
    return temp_array
2 changes: 1 addition & 1 deletion html/tag_dispatchers/blockquote.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher
from ..tag_dispatchers import TagDispatcher


class BlockquoteDispatcher(TagDispatcher):
Expand Down
2 changes: 1 addition & 1 deletion html/tag_dispatchers/code.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher
from ..tag_dispatchers import TagDispatcher


class CodeDispatcher(TagDispatcher):
Expand Down
33 changes: 33 additions & 0 deletions html/tag_dispatchers/div.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# encoding: utf-8
from ..tag_dispatchers import TagDispatcher, replace_whitespaces


class DivDispatcher(TagDispatcher):
    """
    Dispatcher for <div> tags.

    The text of a <div> is written into a paragraph obtained from
    ``get_new_paragraph``; tail text goes into the one returned by
    ``get_current_paragraph``.
    """

    def __init__(self):
        super(DivDispatcher, self).__init__()

    @classmethod
    def append_head(cls, element, container):
        """
        Append ``element.text`` as a run to a paragraph requested via
        ``get_new_paragraph(container)`` and return that paragraph.
        """
        paragraph = cls.get_new_paragraph(container)
        return cls._append_div(element.text, element, paragraph)

    @classmethod
    def append_tail(cls, element, container):
        """
        Append ``element.tail`` as a run to the paragraph returned by
        ``get_current_paragraph(container)`` and return that paragraph.
        """
        paragraph = cls.get_current_paragraph(container)
        return cls._append_div(element.tail, element, paragraph)

    @classmethod
    def _append_div(cls, text, element, container):
        """
        Add ``text`` (after whitespace normalisation) as a run to
        ``container``. Nothing is added when the normalised text is empty.
        The run gets the 'IntenseQuote' style when the element's direct
        parent is a <blockquote>.

        :return: the ``container`` that was passed in
        """
        text = replace_whitespaces(text)
        if not text:
            return container

        style = None
        # A <div> that is the root of the parsed tree has no parent;
        # guard against None instead of failing on ``None.tag``.
        parent = element.getparent()
        if parent is not None and parent.tag == 'blockquote':
            style = 'IntenseQuote'

        container.add_run(text=text, style=style)
        return container
6 changes: 4 additions & 2 deletions html/tag_dispatchers/emphasis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces
from ..tag_dispatchers import TagDispatcher, replace_whitespaces, lists_overlap, get_parental_tags


class EmphasisDispatcher(TagDispatcher):
Expand All @@ -23,6 +23,8 @@ def _append_emphasis(cls, text, element, container):
text = replace_whitespaces(text)
run = container.add_run(text=text)
run.italic = True
if element.getparent().tag == 'strong':
if lists_overlap(get_parental_tags(element), ('strong', 'b')):
run.bold = True
if lists_overlap(get_parental_tags(element), ('u',)):
run.underline = True
return container
2 changes: 1 addition & 1 deletion html/tag_dispatchers/heading.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher
from ..tag_dispatchers import TagDispatcher


class HeadingDispatcher(TagDispatcher):
Expand Down
2 changes: 1 addition & 1 deletion html/tag_dispatchers/linebreak.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# encoding: utf-8
from docx.enum.text import WD_BREAK
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces
from ..tag_dispatchers import TagDispatcher, replace_whitespaces


class LineBreakDispatcher(TagDispatcher):
Expand Down
2 changes: 1 addition & 1 deletion html/tag_dispatchers/link.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher
from ..tag_dispatchers import TagDispatcher


class LinkDispatcher(TagDispatcher):
Expand Down
2 changes: 1 addition & 1 deletion html/tag_dispatchers/list_item.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# encoding: utf-8

# map of HTML list tags and their docx styles
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces
from ..tag_dispatchers import TagDispatcher, replace_whitespaces

_list_style = dict(
ol='ListNumber',
Expand Down
2 changes: 1 addition & 1 deletion html/tag_dispatchers/paragraph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces
from ..tag_dispatchers import TagDispatcher, replace_whitespaces


class ParagraphDispatcher(TagDispatcher):
Expand Down
6 changes: 4 additions & 2 deletions html/tag_dispatchers/strong.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# encoding: utf-8
from mindboard.helpers.docx.html.tag_dispatchers import TagDispatcher, replace_whitespaces
from ..tag_dispatchers import TagDispatcher, replace_whitespaces, lists_overlap, get_parental_tags


class StrongDispatcher(TagDispatcher):
Expand All @@ -23,6 +23,8 @@ def _append_strong(cls, text, element, container):
text = replace_whitespaces(text)
run = container.add_run(text=text)
run.bold = True
if element.getparent().tag == 'em':
if lists_overlap(get_parental_tags(element), ('em', 'i')):
run.italic = True
if lists_overlap(get_parental_tags(element), ('u',)):
run.underline = True
return container
30 changes: 30 additions & 0 deletions html/tag_dispatchers/underline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# encoding: utf-8
from ..tag_dispatchers import TagDispatcher, replace_whitespaces, lists_overlap, get_parental_tags


class UnderlineDispatcher(TagDispatcher):
    """
    Dispatcher for <u> tags: writes text as underlined runs.
    """

    def __init__(self):
        super(UnderlineDispatcher, self).__init__()

    @classmethod
    def append_head(cls, element, container):
        """Add ``element.text`` to ``container`` as an underlined run."""
        return cls._append_underline(element.text, element, container)

    @classmethod
    def append_tail(cls, element, container):
        """Add ``element.tail`` to ``container`` as an underlined run."""
        return cls._append_underline(element.tail, element, container)

    @classmethod
    def _append_underline(cls, text, element, container):
        """
        <u> Creates an underlined text run inside the paragraph container.
        The run is additionally made bold when any ancestor is <strong>/<b>
        and italic when any ancestor is <em>/<i>.

        :return: the ``container`` that was passed in
        """
        text = replace_whitespaces(text)
        run = container.add_run(text=text)
        run.underline = True

        # Walk the ancestor chain once and reuse it for both checks.
        ancestors = get_parental_tags(element)
        if lists_overlap(ancestors, ('strong', 'b')):
            run.bold = True
        if lists_overlap(ancestors, ('em', 'i')):
            run.italic = True
        return container