Program Listing for File mdtopdf.py

Return to documentation for file (pymdtools/mdtopdf.py)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
#
# Copyright (c) 2018 Florent TOURNOIS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# All functions to convert a markdown file to pdf
#
# -----------------------------------------------------------------------------

import logging
import sys
import os
import os.path
import re
import time
import shutil
import codecs
import pdfkit
import markdown as mkd
import PyPDF2

from . import common
from . import instruction
from . import mistunege as mistune


# -----------------------------------------------------------------------------
# Add a blank page to the pdf so that it has an even number of pages
# -----------------------------------------------------------------------------
def check_odd_pages(filename):
    """
    Make sure a pdf file has an even number of pages.

    When the page count is already even, the file is left untouched.
    Otherwise a backup is created with common.create_backup and the file
    is rewritten from that backup with one blank page appended.

    @type filename: string
    @param filename: The name and path of the pdf file to check.

    @return the filename as returned by
            common.check_is_file_and_correct_path
    """
    filename = common.check_is_file_and_correct_path(filename,
                                                     filename_ext=".pdf")

    # Only the page count is needed here, so the stream is released at once.
    with open(filename, 'rb') as input_pdf:
        num_pages = PyPDF2.PdfFileReader(input_pdf).getNumPages()

    if num_pages % 2 == 0:
        return filename

    # Rebuild the file from the backup and add the missing blank page.
    backup_filename = common.create_backup(filename)

    # The backup stream is kept open until the output has been written
    # (the writer pulls its pages from the reader) and is then closed,
    # even when writing fails.
    with open(backup_filename, 'rb') as in_pdf:
        out_pdf = PyPDF2.PdfFileWriter()
        out_pdf.appendPagesFromReader(PyPDF2.PdfFileReader(in_pdf))
        out_pdf.addBlankPage()

        with open(filename, 'wb') as out_stream:
            out_pdf.write(out_stream)

    return filename

# -----------------------------------------------------------------------------
# Convert md text to html
#
# @param text the markdown text
# @return the html fragment
# -----------------------------------------------------------------------------
def converter_md_to_html_markdown(text):
    """
    Convert markdown text to html with the markdown package (mkd).

    @type text: string
    @param text: The markdown text.

    @return the html fragment rendered with output_format "xhtml5"
    """
    html_fragment = mkd.markdown(text, output_format="xhtml5")
    return html_fragment


# -----------------------------------------------------------------------------
# Convert md text to html
#
# @param text the markdown text
# @return the html fragment
# -----------------------------------------------------------------------------
def converter_md_to_html_mistune(text):
    """
    Convert markdown text to html with the mistune module of this package.

    @type text: string
    @param text: The markdown text.

    @return the html fragment produced by a mistune.Markdown instance
            that uses a renderer created with use_xhtml=True
    """
    to_html = mistune.Markdown(renderer=mistune.Renderer(use_xhtml=True))
    return to_html(text)


# -----------------------------------------------------------------------------
# get the markdown to html converter
# -----------------------------------------------------------------------------
@common.static(__converters__=None)
def get_md_to_html_converter(converter_name):
    """
    Return the markdown to html converter registered under a name.

    The table of converters is stored on the function itself in the
    __converters__ attribute and is filled on the first call.
    NOTE(review): common.static presumably creates that attribute with the
    value None - confirm against the common module.

    @type converter_name: string
    @param converter_name: 'mistune' or 'markdown'. Any other value is
                           logged and replaced by 'markdown'.

    @return the converter function (it takes the markdown text and
            returns the html fragment)
    """
    this = get_md_to_html_converter

    # Lazy initialisation of the table of known converters
    if this.__converters__ is None:
        this.__converters__ = {
            'mistune': converter_md_to_html_mistune,
            'markdown': converter_md_to_html_markdown,
        }

    registry = this.__converters__
    if converter_name in registry:
        return registry[converter_name]

    # Unknown name: fall back to the markdown converter
    logging.info('Converter %s does not exist', converter_name)
    logging.info('Converter change to classique markdown')
    return registry['markdown']


# -----------------------------------------------------------------------------
# Convert md file to html with a layout
#
# @param filename the filename of the markdown file
# @param layout the layout chosen
# @param filename_ext the markdown extension expected for the filename.
# @param encoding Encoding of the html output file.
# @param converter the html converter. a string with the name of the converter
# @param path_dest the destination folder for the html
# @return the html filename
# -----------------------------------------------------------------------------
def convert_md_to_html(filename, layout="jasonm23-swiss",
                       filename_ext=".md", encoding="utf-8",
                       path_dest=None, converter=None):
    """
    Convert a markdown file to a html file using a layout.

    The markdown is converted to a html fragment and injected in the
    "page.html" template of the layout. Every {{...}} instruction found
    in the template is handled as follows:
      - {{title}} is replaced by the title found in the markdown text,
      - {{~> content}} is replaced by the converted html fragment,
      - {{asset ...}} copies the asset from the "assets" folder of the
        layout to the destination folder and is replaced by its relative
        path,
      - any other instruction whose name is a variable of the markdown
        text is replaced by the value of that variable.
    Other instructions are left unchanged.

    @type filename: string
    @param filename: The name and path of the markdown file.

    @type layout: string
    @param layout: The name of the layout (a folder under "layouts").

    @type filename_ext: string
    @param filename_ext: The extension handed to
                         common.check_is_file_and_correct_path.

    @type encoding: string
    @param encoding: Encoding of the html output file.

    @type path_dest: string
    @param path_dest: The destination folder for the html.
                      Default: the folder of the markdown file.

    @type converter: string
    @param converter: The name of the converter
                      (see get_md_to_html_converter).

    @return the html filename
    @raise Exception: when the markdown file is empty
    """
    logging.info('Convert md -> html %s', filename)

    filename = common.check_is_file_and_correct_path(filename, filename_ext)

    if path_dest is None:
        path_dest = os.path.split(os.path.abspath(filename))[0]
    path_dest = common.check_folder(path_dest)

    # Read the file
    content = common.get_file_content(filename)
    content_vars = instruction.get_vars_from_md_text(content)
    title = instruction.get_title_from_md_text(content)
    if title is None:
        title = ""

    if len(content) == 0:
        logging.error('The filename %s seem empty', filename)
        raise Exception('The filename %s seem empty' % filename)

    content = get_md_to_html_converter(converter)(content)

    # find the layout
    first_path = os.path.dirname(os.path.realpath(__get_this_filename()))

    page_html_filename = \
        common.search_for_file("page.html",
                               [first_path, os.path.join(
                                   first_path, "lib", "pymdtools")],
                               [os.path.join("layouts", layout)], 1)

    layout_path = common.check_folder(os.path.dirname(page_html_filename))

    # Get the content
    page_html = common.get_file_content(page_html_filename)

    # Parse the instructions. The match is non-greedy so that several
    # instructions written on the same line are found one by one instead
    # of being merged in a single match that no branch recognises.
    for inst in re.findall(r"{{.+?}}", page_html):
        logging.debug('instruction %s', inst)
        if inst == '{{title}}':
            page_html = page_html.replace(inst, title)
        elif inst == '{{~> content}}':
            page_html = page_html.replace(inst, content)
        elif inst.startswith('{{asset'):
            # Strip the 9 leading and 3 trailing characters around the path
            file_objet = inst[9:-3]
            # startswith is safe on an empty path (no IndexError)
            if file_objet.startswith('/'):
                file_objet = file_objet[1:]

            dst_file = common.set_correct_path(
                os.path.join(path_dest, file_objet))
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            shutil.copy(common.set_correct_path(os.path.join(layout_path,
                                                             "assets",
                                                             file_objet)),
                        dst_file)
            page_html = page_html.replace(inst, file_objet)
        elif inst[2:-2] in content_vars:
            page_html = page_html.replace(inst, content_vars[inst[2:-2]])

    html_filename = common.set_correct_path(os.path.join(
        path_dest, os.path.splitext(os.path.split(filename)[1])[0] + ".html"))
    logging.info('        -> html %s', html_filename)

    # Save the html file; the stream is closed even if the write fails.
    with codecs.open(html_filename, "w", encoding=encoding,
                     errors="xmlcharrefreplace") as output_file:
        output_file.write(page_html)

    return html_filename

# -----------------------------------------------------------------------------
# Find the wkhtmltopdf tool
#
# @return full path to the file "wkhtmltopdf.exe"
# -----------------------------------------------------------------------------
def find_wk_html_to_pdf():
    """
    Search for the wkhtmltopdf tool.

    The file "wkhtmltopdf.exe" is searched with common.search_for_file,
    starting from a list of start points combined with a list of
    relative sub-folders, with nb_up_path=4.

    @return the result of common.search_for_file for "wkhtmltopdf.exe"
    """
    logging.info('Search wkhtmltopdf')

    # Sub-folders tried under each start point (the order is kept)
    relative_paths = ['bin', 'wkhtmltopdf', 'wkhtmltopdf/bin']
    for root in ('software', 'third_party_software'):
        relative_paths.extend([root + '/wkhtmltopdf/bin',
                               root + '/wkhtmltopdf',
                               root + '/bin',
                               root])

    start_points = [
        "C:\\Program Files\\wkhtmltopdf",
        "./",
        __get_this_filename(),
        "D:\\Program Files\\wkhtmltopdf",
    ]

    found = common.search_for_file("wkhtmltopdf.exe", start_points,
                                   relative_paths, nb_up_path=4)
    return found

# -----------------------------------------------------------------------------
# Convert html file to a pdf file at the same location
#
# @return full path to the pdf file
# -----------------------------------------------------------------------------
def convert_html_to_pdf(filename, filename_ext=".html", **kwargs):
    """
    Convert a html file to a pdf file written next to the html file.

    The conversion is done by pdfkit, configured with the wkhtmltopdf
    tool returned by find_wk_html_to_pdf. The pdf gets a centered header
    text, a centered page counter in the footer and the current date
    (UTC, format dd/mm/YYYY) on the right of the footer.

    @type filename: string
    @param filename: The name and path of the html file.

    @type filename_ext: string
    @param filename_ext: The extension handed to
                         common.check_is_file_and_correct_path.

    @param kwargs: 'title' is the text of the header. When it is missing
                   or None, the html filename without folder and
                   extension is used.

    @return the pdf filename (the html filename with a ".pdf" extension)
    """
    logging.info('Convert html -> pdf %s', filename)
    filename = common.check_is_file_and_correct_path(filename, filename_ext)

    config = pdfkit.configuration(wkhtmltopdf=find_wk_html_to_pdf())

    # Header: the given title, or the bare name of the html file
    header_text = kwargs.get('title')
    if header_text is None:
        bare_name = os.path.splitext(os.path.basename(filename))[0]
        header_text = '%s' % (bare_name)

    date_print = time.strftime("%d/%m/%Y", time.gmtime())

    # Options for wkhtmltopdf (the original order of the keys is kept)
    options = dict((
        ('header-center', header_text),
        ('footer-center', 'page [page] sur [toPage]'),
        ('footer-font-size', '8'),
        ('footer-right', date_print),
        ('margin-top', '20mm'),
        ('margin-bottom', '20mm'),
        ('footer-spacing', '10'),
        ('header-spacing', '10'),
        ('header-font-size', '8'),
        ('quiet', ''),
    ))

    root_name, _ = os.path.splitext(filename)
    pdf_filename = root_name + ".pdf"
    pdfkit.from_file(filename, pdf_filename,
                     options=options, configuration=config)
    logging.info('Conversion finished for %s', filename)

    return pdf_filename

# -----------------------------------------------------------------------------
# Add features to the pdf
#
# @param filename the filename of the pdf
# @param filename_ext the pdf extension expected for the filename.
# @param kwargs all the options.
# @return the pdf filename
# -----------------------------------------------------------------------------
def pdf_features(filename, filename_ext=".pdf", **kwargs):
    """
    Add features (metadata, backgrounds, watermark) to a pdf file.

    The pdf is copied to a temporary folder, read from there and written
    back in place with the requested features. Every file opened here and
    the temporary folder are released even when an error occurs.

    @type filename: string
    @param filename: The name and path of the pdf file (rewritten in place).

    @type filename_ext: string
    @param filename_ext: The extension handed to
                         common.check_is_file_and_correct_path.

    @param kwargs: the options:
        - 'metadata': a dict; each key is written as a pdf metadata entry
          named '/' + the key with its first letter in upper case.
          Entries already present in the pdf and not given here are
          written as empty strings.
        - 'path': a folder joined in front of every 'pdf_*' filename.
        - 'pdf_background': a pdf whose first page is merged into each
          page.
        - 'pdf_background_first_page': a pdf whose first page is merged
          into the first page instead of 'pdf_background'.
        - 'pdf_watermark': a pdf whose first page is merged into each
          page after the background.

    @return the pdf filename
    """
    logging.info('pdf features %s', filename)
    filename = common.check_is_file_and_correct_path(filename, filename_ext)

    temp_dir = common.get_new_temp_dir()
    # Streams which must stay open until the output has been written
    # (the readers are still used while the writer writes the pages).
    opened_files = []
    try:
        temp_pdf_filename = os.path.join(temp_dir,
                                         os.path.basename(filename))
        shutil.copy(filename, temp_pdf_filename)

        file_in = open(temp_pdf_filename, 'rb')
        opened_files.append(file_in)
        pdf_reader = PyPDF2.PdfFileReader(file_in)

        # getDocumentInfo may return None when the pdf has no information
        # dictionary; in that case there is no existing entry to blank.
        pdf_metadata = pdf_reader.getDocumentInfo() or {}

        metadata = {}
        for key in pdf_metadata:
            metadata[key] = ''
        if 'metadata' in kwargs:
            for key in kwargs['metadata']:
                metadata['/' + key[0].upper() + key[1:]] = \
                    kwargs['metadata'][key]

        # Every 'pdf_xxx' option is the filename of a pdf read as 'xxx'
        pdf_args = {}
        for key in kwargs:
            if not key.startswith('pdf_'):
                continue
            local_name = kwargs[key]
            if 'path' in kwargs:
                local_name = os.path.join(kwargs['path'], local_name)
            local_name = common.check_is_file_and_correct_path(local_name)
            local_file = open(local_name, "rb")
            opened_files.append(local_file)
            pdf_args[key[4:]] = PyPDF2.PdfFileReader(local_file)

        num_pages = pdf_reader.getNumPages()
        pdf_writer = PyPDF2.PdfFileWriter()

        for page_number in range(num_pages):
            page = pdf_reader.getPage(page_number)
            if page_number == 0:
                # The first page may have its own background
                if 'background_first_page' in pdf_args:
                    page.mergePage(
                        pdf_args['background_first_page'].getPage(0))
                elif 'background' in pdf_args:
                    page.mergePage(pdf_args['background'].getPage(0))
            else:
                if 'background' in pdf_args:
                    page.mergePage(pdf_args['background'].getPage(0))
            if 'watermark' in pdf_args:
                page.mergePage(pdf_args['watermark'].getPage(0))
            pdf_writer.addPage(page)

        pdf_writer.addMetadata(metadata)

        with open(filename, 'wb') as file_out:
            pdf_writer.write(file_out)
    finally:
        # Close the streams before the temporary folder is removed
        for opened_file in opened_files:
            opened_file.close()
        shutil.rmtree(temp_dir)

    return filename

# -----------------------------------------------------------------------------
# Convert md file to pdf
#
# @param filename the filename of the markdown file
# @param filename_ext the markdown extension expected for the filename.
# @return the pdf filename
# -----------------------------------------------------------------------------
def convert_md_to_pdf(filename, filename_ext=".md", **kwargs):
    """
    This function takes a markdown file, loads the content and creates
    a pdf with the same name next to it.

    The conversion md -> html -> pdf is done in a temporary folder which
    is removed afterwards, even when one of the steps fails. The
    resulting pdf is then handed to pdf_features.

    @type filename: string
    @param filename: The name and path of the file to work with.
                     This file is supposed to be a markdown file.

    @type filename_ext: string
    @param filename_ext: The markdown extension expected
                         for the filename.

    @param kwargs: The options forwarded to pdf_features. The variables
                   of the markdown file are already passed as 'metadata'.

    @return the pdf filename
    """
    logging.info('Convert md -> pdf %s', filename)
    filename = common.check_is_file_and_correct_path(filename, filename_ext)
    md_metadata = instruction.get_vars_from_md_file(filename)

    # The header title: 'page:title' has priority over 'title'
    title = None
    if 'title' in md_metadata:
        title = md_metadata['title']
    if 'page:title' in md_metadata:
        title = md_metadata['page:title']

    pdf_filename = os.path.splitext(filename)[0] + ".pdf"

    temp_dir = common.get_new_temp_dir()
    try:
        temp_md_filename = os.path.join(temp_dir, os.path.basename(filename))

        logging.info('Copy file to temp')
        shutil.copy(filename, temp_md_filename)
        logging.info('Convert md to html')
        temp_html_filename = convert_md_to_html(
            temp_md_filename, converter='mistune')

        logging.info('Convert html to pdf title=%s', title)
        temp_pdf_filename = convert_html_to_pdf(temp_html_filename,
                                                title=title)

        logging.info('Copy file from temp')
        shutil.copy(temp_pdf_filename, pdf_filename)
    finally:
        # remove the temp dir, also when a conversion step has failed
        logging.info('Remove the temp dir')
        shutil.rmtree(temp_dir)

    # add features
    pdf_features(pdf_filename, filename_ext=".pdf",
                 metadata=md_metadata, **kwargs)

    return pdf_filename

# -----------------------------------------------------------------------------
# Find the filename of this file (depend on the frozen or not)
# This function return the filename of this script.
# The function is complex for the frozen system
#
# @return the filename of THIS script.
# -----------------------------------------------------------------------------
def __get_this_filename():
    """
    Return the filename of this script.

    When the attribute sys.frozen exists and is true, the path of the
    executable (sys.executable) is returned, otherwise the value of
    __file__ for this module.

    @return the filename of THIS script.
    """
    if not getattr(sys, 'frozen', False):
        # unfrozen
        return __file__

    # frozen
    return sys.executable