markdown.py

text/plain • 14.79 kiB
Python script, ASCII text executable
        
            1
            """
        
            2
            This is a parser for a Markdown-like language, but it isn't compatible with
        
            3
            the CommonMark specification; check doc/enduser/Formatting messages.md for
        
            4
            its syntax.
        
            5
            
        
            6
            Roundabout - git hosting for everyone <https://roundabout-host.com>
        
            7
            Copyright (C) 2023-2025 Roundabout developers <root@roundabout-host.com>
        
            8
            
        
            9
            This program is free software: you can redistribute it and/or modify
        
            10
            it under the terms of the GNU Affero General Public License as published by
        
            11
            the Free Software Foundation, either version 3 of the License, or
        
            12
            (at your option) any later version.
        
            13
            
        
            14
            This program is distributed in the hope that it will be useful,
        
            15
            but WITHOUT ANY WARRANTY; without even the implied warranty of
        
            16
            MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
        
            17
            GNU Affero General Public License for more details.
        
            18
            
        
            19
            You should have received a copy of the GNU Affero General Public License
        
            20
            along with this program.  If not, see <http://www.gnu.org/licenses/>.
        
            21
            """
        
            22
            
        
            23
            
        
            24
            import re
        
            25
            import bs4 as beautifulsoup
        
            26
            import sys
        
            27
            
        
            28
            
        
            29
            def only_chars(string, chars):
                """Return True when every character of ``string`` also occurs in ``chars``.

                An empty ``string`` trivially passes the check.
                """
                return set(string) <= set(chars)
        
            33
            
        
            34
            
        
            35
            # Verbose-mode pattern for all inline constructs. It is compiled by parse_line
            # with re.MULTILINE | re.DOTALL | re.VERBOSE, so whitespace and "#" comments
            # inside the pattern are ignored. Alternatives are tried top to bottom, which
            # is why the two-backtick code span must come before the one-backtick span.
            inline_regex = r"""
            (?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)     # hyperlink or media
            |
            <(?P<urlDestination2>[^<>]*)>                                                      # autolink
            |
            (?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em)                                 # emphasis with * not requiring space on either side
            |
            (?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*?) (?P=em2)(?=\s|$)                # emphasis with _ delimited by whitespace or a line boundary; the lookbehind keeps the leading whitespace out of the match and the lazy body stops at the first valid closer
            |
            `` (?P<textCode2>(?:(?!``).)+) ``                                                  # inline code (2 backticks); may contain single backticks
            |
            [`] (?P<textCode>(?:\\[`]|[^`])*) [`]                                              # inline code
            |
            (?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})                            # strikethrough
            |
            (?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)                      # diffs
            """
        
            52
            
        
            53
            
        
            54
            def leading(string, character):
                """Count the characters that ``str.lstrip(character)`` removes from the start of ``string``."""
                remainder = string.lstrip(character)
                return len(string) - len(remainder)
        
            56
            
        
            57
            
        
            58
            def trailing(string, character):
                """Count the characters that ``str.rstrip(character)`` removes from the end of ``string``."""
                remainder = string.rstrip(character)
                return len(string) - len(remainder)
        
            60
            
        
            61
            
        
            62
            class Element:
                """Base node of the parsed document; on its own it represents a void block."""

                def __init__(self):
                    # Subclasses overwrite this with their parsed content.
                    self.content = None
                    # CSS class names; make_html joins them into the tag's "class" attribute.
                    self.classes = []

                def __repr__(self):
                    return "Void block"

                @property
                def tag_name(self):
                    """Tag name used for rendering; make_html skips nodes named ``m-void``."""
                    return "m-void"
        
            74
            
        
            75
            
        
            76
            class Container(Element):
                """Element whose content is one piece of inline markup, parsed on construction."""

                def __init__(self, content):
                    super().__init__()
                    # parse_line yields a list mixing plain strings and inline elements.
                    parsed = parse_line(content)
                    self.content = parsed

                def __repr__(self):
                    return f"Generic container element: {self.content!r}"
        
            83
            
        
            84
            
        
            85
            class Rule(Element):
                """Horizontal rule, rendered as an ``hr`` tag."""

                # No constructor of its own: Element.__init__ already takes no arguments.

                def __repr__(self):
                    return "Rule"

                @property
                def tag_name(self):
                    return "hr"
        
            95
            
        
            96
            
        
            97
            class HardBreak(Element):
                """Forced line break, rendered as a ``br`` tag."""

                # No constructor of its own: Element.__init__ already takes no arguments.

                def __repr__(self):
                    return "Hard break"

                @property
                def tag_name(self):
                    return "br"
        
            107
            
        
            108
            
        
            109
            class Heading(Container):
                """Heading whose inline content is parsed by Container and whose tag is ``h<level>``."""

                def __init__(self, content, level):
                    super().__init__(content)
                    self.level = level

                def __repr__(self):
                    return f"Heading level {self.level}:\n\t{self.content!r}"

                @property
                def tag_name(self):
                    return f"h{self.level}"
        
            121
            
        
            122
            
        
            123
            class Paragraph(Container):
                """Paragraph of inline markup, rendered as a ``p`` tag."""

                def __init__(self, content):
                    # Container.__init__ already runs parse_line on its argument, so hand
                    # it the real text instead of parsing "" first and re-parsing here.
                    super().__init__(content)

                def __repr__(self):
                    return "Paragraph:\n\t" + repr(self.content)

                @property
                def tag_name(self):
                    return "p"
        
            134
            
        
            135
            
        
            136
            class CodeBlock(Element):
                """Fenced code block; the content is stored as given, without inline parsing."""

                def __init__(self, content, language="text"):
                    super().__init__()
                    # make_html writes the language into the tag's "data-language" attribute.
                    self.language = language
                    self.content = content

                def __repr__(self):
                    return f"Code block ({self.language}):\n\t{self.content!r}"

                @property
                def tag_name(self):
                    return "pre"
        
            148
            
        
            149
            
        
            150
            class UnorderedList(Element):
                """Bulleted list, rendered as ``ul``; tokenise passes a list of ListItem objects as content."""

                def __init__(self, content):
                    super().__init__()
                    self.content = content

                def __repr__(self):
                    return f"Unordered list:\n\t{self.content!r}"

                @property
                def tag_name(self):
                    return "ul"
        
            161
            
        
            162
            
        
            163
            class OrderedList(Element):
                """Numbered list, rendered as ``ol``; tokenise passes a list of ListItem objects as content."""

                def __init__(self, content):
                    super().__init__()
                    self.content = content

                def __repr__(self):
                    return f"Ordered list:\n\t{self.content!r}"

                @property
                def tag_name(self):
                    return "ol"
        
            174
            
        
            175
            
        
            176
            class ListItem(Element):
                """Single list entry, rendered as ``li``."""

                def __init__(self, content):
                    super().__init__()
                    # The item text is tokenised as block-level markup, so an item can
                    # itself hold paragraphs, nested lists and so on.
                    blocks = tokenise(content)
                    self.content = blocks

                def __repr__(self):
                    return f"List item:\n\t{self.content!r}"

                @property
                def tag_name(self):
                    return "li"
        
            187
            
        
            188
            
        
            189
            class Blockquote(Paragraph):
                """Quoted section, rendered as ``blockquote``."""

                def __init__(self, content):
                    # Initialise the parent with empty text: the quoted text must go
                    # through the block-level tokeniser, not the inline parser.
                    super().__init__("")
                    blocks = tokenise(content)
                    self.content = blocks

                def __repr__(self):
                    return f"Blockquote:\n\t{self.content!r}"

                @property
                def tag_name(self):
                    return "blockquote"
        
            200
            
        
            201
            
        
            202
            class Emphasis(Container):
                """Emphasised inline text; ``value`` is the number of marker characters used."""

                def __init__(self, content, value):
                    super().__init__(content)
                    self.value = value

                    # One CSS class per condition, strongest first.
                    flags = (
                        (value >= 4, "emphasis-3"),
                        (value % 4 >= 2, "emphasis-2"),
                        (value % 2, "emphasis-1"),
                    )
                    for active, class_name in flags:
                        if active:
                            self.classes.append(class_name)

                def __repr__(self):
                    return f"Emphasis ({self.value}): {self.content!r}"

                @property
                def tag_name(self):
                    if self.value == 1:
                        return "em"
                    return "strong"
        
            219
            
        
            220
            
        
            221
            class Code(Element):
                """Inline code span, rendered as ``code``."""

                def __init__(self, content):
                    super().__init__()
                    # Stored as a one-item list holding the raw text; it is not parsed
                    # for further inline markup.
                    self.content = [content]

                def __repr__(self):
                    return "Inline code: " + str(self.content)

                @property
                def tag_name(self):
                    return "code"
        
            232
            
        
            233
            
        
            234
            class Strikethrough(Container):
                """Struck-through inline text, rendered as ``s``."""

                # Container.__init__(content) is inherited unchanged.

                def __repr__(self):
                    return f"Strikethrough: {self.content!r}"

                @property
                def tag_name(self):
                    return "s"
        
            244
            
        
            245
            
        
            246
            class Diff(Container):
                """Inline insertion or deletion; ``value`` is the marker that delimited it."""

                def __init__(self, content, value):
                    super().__init__(content)
                    self.value = value

                def __repr__(self):
                    return "Diff ({}): {}".format(self.value, self.content)

                @property
                def tag_name(self):
                    # "++" marks an insertion; any other marker is rendered as a deletion.
                    if self.value == "++":
                        return "ins"
                    return "del"
        
            257
            
        
            258
            
        
            259
            class Link(Element):
                """Hyperlink, rendered as ``a``; the link text is parsed as inline markup."""

                def __init__(self, content, destination, image=False):
                    super().__init__()
                    self.image = image
                    self.destination = destination
                    self.content = parse_line(content)

                def __repr__(self):
                    label = "Image" if self.image else "Link"
                    return f"{label}: {self.content} -> {self.destination}"

                @property
                def tag_name(self):
                    return "a"
        
            272
            
        
            273
            
        
            274
            class Image(Link):
                """Embedded image, rendered as ``img``; a Link constructed with ``image`` set to True."""

                def __init__(self, text, destination):
                    super().__init__(text, destination, image=True)

                @property
                def tag_name(self):
                    return "img"
        
            281
            
        
            282
            
        
            283
            def _inline_element(match):
                """Build the inline element described by one ``inline_regex`` match.

                Returns ``None`` when the match carries nothing to render (an empty code
                span, an empty autolink, a text-less link), so that the caller can keep
                the matched text instead of silently dropping it.
                """
                if match.group("em"):
                    return Emphasis(match.group("textEm"), len(match.group("em")))
                if match.group("em2"):
                    return Emphasis(match.group("textEm2"), len(match.group("em2")))
                if match.group("textCode"):
                    return Code(match.group("textCode"))
                if match.group("textCode2"):
                    return Code(match.group("textCode2"))
                if match.group("strike"):
                    return Strikethrough(match.group("textStrike"))
                if match.group("diff"):
                    return Diff(match.group("textDiff"), match.group("diff"))

                if match.group("urlText") is not None:
                    # An image with empty alt text is still an image.
                    if match.group("imageFlag"):
                        return Image(match.group("urlText"), match.group("urlDestination"))
                    if match.group("urlText"):
                        return Link(match.group("urlText"), match.group("urlDestination"))
                    return None

                url_destination = match.group("urlDestination2")
                if url_destination:
                    if "://" in url_destination:
                        url_text = url_destination
                    else:
                        url_text = url_destination.partition(":")[2]    # remove tel, mailto, sms prefixes
                        if url_destination.startswith("mailto:"):
                            url_destination = url_destination.replace("@", "&#64;")  # prevent email harvesting
                            url_text = url_text.replace("@", "&#64;")                # prevent protocol injection
                    return Link(url_text, url_destination)

                return None


            def parse_line(source):
                """Parse inline markup into a list of plain strings and inline elements.

                A single trailing backslash is removed and turned into a ``HardBreak``
                appended at the end of the result. Text between matches of
                ``inline_regex`` is kept as plain strings, in source order. Matched text
                that yields no element is kept verbatim.
                """
                hard_break = trailing(source, "\\") == 1
                if hard_break:
                    source = source.rstrip("\\")

                tokens = []
                pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

                lookup = 0
                for match in pattern.finditer(source):
                    tokens.append(source[lookup:match.start()])
                    lookup = match.end()

                    element = _inline_element(match)
                    if element is None:
                        # Nothing to render: keep the source text rather than losing it.
                        tokens.append(match.group(0))
                        continue

                    if match.group("em2"):
                        # The underscore alternative may have consumed the whitespace in
                        # front of the marker; put it back (empty when it did not).
                        tokens.append(source[match.start():match.start("em2")])
                    tokens.append(element)

                tokens.append(source[lookup:])

                if hard_break:
                    tokens.append(HardBreak())

                return tokens
        
            337
            
        
            338
            
        
            339
            def _is_bullet(line):
                """Tell whether ``line`` opens an unordered list item: ``*``, ``+`` or ``-`` then a space."""
                return line[:1] in ("*", "+", "-") and line[1:2] == " "


            def _is_rule(line):
                """Tell whether ``line`` is a horizontal rule: 3+ characters drawn from ``-_* ``."""
                stripped = line.strip()
                return len(stripped) >= 3 and only_chars(stripped, "-_* ")


            def _continues_paragraph(line):
                """Tell whether ``line`` can be appended to the paragraph being collected."""
                if not line.strip():
                    return False
                if line.startswith(("#", ">", ";", "* ", "+ ", "- ", "~~~", "```")):
                    return False
                return not re.match(r"^\d+\.", line)


            def tokenise(source):
                """Split ``source`` into a list of block-level elements.

                One leading space is removed from every line, then lines are grouped into
                void blocks (blank lines and lines starting with ``;``), rules, unordered
                and ordered lists, ``#`` headings, blockquotes, fenced code blocks,
                underlined headings and paragraphs. The returned list starts with a void
                ``Element`` and contains every block in source order.
                """
                tokens = []

                current_block = Element()

                lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]  # remove one leading space

                i = 0
                while i < len(lines):
                    line = lines[i]

                    # Every branch below opens a new block, so the previous one is
                    # always complete here. Appending unconditionally keeps adjacent
                    # blocks of the same type (e.g. two code fences) from replacing
                    # one another.
                    tokens.append(current_block)

                    if not line.strip() or line.startswith(";"):
                        # Void block
                        current_block = Element()
                        i += 1
                    elif _is_rule(line):
                        # Horizontal rule
                        current_block = Rule()
                        i += 1
                    elif _is_bullet(line):
                        items = []
                        while i < len(lines) and _is_bullet(lines[i]):
                            inner_content = lines[i][2:].strip() + "\n"      # discard marker and space
                            i += 1
                            # Following non-blank lines belong to the item until the next marker.
                            while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                                inner_content += lines[i] + "\n"
                                i += 1
                            items.append(ListItem(inner_content))

                        current_block = UnorderedList(items)
                    elif re.match(r"^\d+\.", line):
                        items = []
                        while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                            marker, _, rest = lines[i].partition(".")
                            inner_content = rest + "\n"                      # discard number and period
                            # Indentation expected on continuation lines, measured on the
                            # marker line itself before moving past it.
                            marker_length = len(marker) + 1
                            i += 1
                            while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                                if re.match(r"^  \d+\.", lines[i]):
                                    marker_length = 2
                                # Remove indentation only, never text of an unindented line.
                                indent = min(marker_length, leading(lines[i], " "))
                                inner_content += lines[i][indent:] + "\n"
                                i += 1
                            items.append(ListItem(inner_content))

                        current_block = OrderedList(items)
                    elif line.startswith("#") and leading(line.lstrip("#"), " "):
                        content = line.lstrip("#").strip()
                        current_block = Heading(content, leading(line, "#"))
                        i += 1
                    elif line.startswith(">"):
                        content = ""
                        # Quoted lines, plus unmarked non-blank lines that directly follow
                        # them, unless such a line is a "#" line or a horizontal rule.
                        while i < len(lines) and (
                            lines[i].startswith(">")
                            or (
                                lines[i].strip()
                                and not lines[i].startswith("#")
                                and not _is_rule(lines[i])
                            )
                        ):
                            content += lines[i].lstrip(">") + "\n"
                            i += 1

                        current_block = Blockquote(content)
                    elif leading(line, "~") == 3 or leading(line, "`") == 3:
                        language = line.lstrip("`~").strip()

                        content = ""
                        i += 1        # skip the opening fence
                        while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                            content += lines[i] + "\n"
                            i += 1

                        if i < len(lines):
                            i += 1    # prevent a new block from beginning with the closing fence

                        current_block = CodeBlock(content, language=language)
                    elif (
                        i < len(lines) - 1
                        and lines[i + 1].strip()
                        and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
                    ):
                        # Heading underlined with "=" (level 1) or "-" (level 2).
                        underline = lines[i + 1].strip()
                        current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
                        i += 2
                    else:
                        # Create a paragraph, if there is no other specifier. The first
                        # line is always taken: it reached this branch because no block
                        # rule accepted it (e.g. "#tag" or "~~~~"), and skipping it would
                        # leave ``i`` unchanged and loop forever.
                        content = line.strip() + "\n"
                        i += 1
                        while i < len(lines) and _continues_paragraph(lines[i]):
                            content += lines[i].strip() + "\n"
                            i += 1

                        current_block = Paragraph(content)

                tokens.append(current_block)

                return tokens
        
            465
            
        
            466
            
        
            467
            def _plain_text(content):
                """Flatten parsed content (strings, lists, elements) into plain text."""
                if isinstance(content, str):
                    return content
                if isinstance(content, list):
                    return "".join(_plain_text(part) for part in content)
                # An element: use its own content; elements without any contribute nothing.
                return _plain_text(getattr(content, "content", None) or "")


            def make_html(ast):
                """Render a list of strings and elements into a BeautifulSoup tree.

                Strings are appended as text. Elements become a tag named after their
                ``tag_name``; elements named ``m-void`` are skipped. List content is
                rendered recursively, string content becomes the tag's text.
                """
                soup = beautifulsoup.BeautifulSoup()
                for node in ast:
                    # Use bs4 to generate HTML
                    if isinstance(node, str):
                        soup.append(node)
                    elif hasattr(node, "content") and node.tag_name != "m-void":
                        tag = soup.new_tag(str(node.tag_name))
                        if node.tag_name == "a":
                            # NOTE(review): the destination is copied verbatim; if the
                            # input is untrusted, schemes such as "javascript:" may need
                            # filtering here or by the caller -- confirm.
                            tag["href"] = node.destination
                        if node.tag_name == "img":
                            tag["src"] = node.destination
                            # The alt text may contain inline elements (e.g. emphasis);
                            # joining the raw list would raise TypeError on them.
                            tag["alt"] = _plain_text(node.content)
                        if node.tag_name == "pre":
                            tag["data-language"] = node.language
                        if node.classes:
                            tag["class"] = " ".join(node.classes)
                        try:
                            if node.tag_name == "img":
                                # Images carry their text in "alt" and get no children.
                                tag.string = ""
                            elif isinstance(node.content, list):
                                tag.append(make_html(node.content))
                            elif node.content:
                                tag.string = node.content
                        except AttributeError as exc:
                            print(exc, file=sys.stderr)
                        soup.append(tag)
                return soup
        
            497
            
        
            498
            
        
            499
            def markdown2html(markdown):
                """Convert markup text to HTML: tokenise it, then render the tokens with make_html."""
                ast = tokenise(markdown)
                return make_html(ast)
        
            501
            
        
            502